// Written in the D programming language. /++ $(P The `std.uni` module provides an implementation of fundamental Unicode algorithms and data structures. This doesn't include UTF encoding and decoding primitives, see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) for this functionality. ) $(SCRIPT inhibitQuickIndex = 1;) $(DIVC quickindex, $(BOOKTABLE, $(TR $(TH Category) $(TH Functions)) $(TR $(TD Decode) $(TD $(LREF byCodePoint) $(LREF byGrapheme) $(LREF decodeGrapheme) $(LREF graphemeStride) )) $(TR $(TD Comparison) $(TD $(LREF icmp) $(LREF sicmp) )) $(TR $(TD Classification) $(TD $(LREF isAlpha) $(LREF isAlphaNum) $(LREF isCodepointSet) $(LREF isControl) $(LREF isFormat) $(LREF isGraphical) $(LREF isIntegralPair) $(LREF isMark) $(LREF isNonCharacter) $(LREF isNumber) $(LREF isPrivateUse) $(LREF isPunctuation) $(LREF isSpace) $(LREF isSurrogate) $(LREF isSurrogateHi) $(LREF isSurrogateLo) $(LREF isSymbol) $(LREF isWhite) )) $(TR $(TD Normalization) $(TD $(LREF NFC) $(LREF NFD) $(LREF NFKD) $(LREF NormalizationForm) $(LREF normalize) )) $(TR $(TD Decompose) $(TD $(LREF decompose) $(LREF decomposeHangul) $(LREF UnicodeDecomposition) )) $(TR $(TD Compose) $(TD $(LREF compose) $(LREF composeJamo) )) $(TR $(TD Sets) $(TD $(LREF CodepointInterval) $(LREF CodepointSet) $(LREF InversionList) $(LREF unicode) )) $(TR $(TD Trie) $(TD $(LREF codepointSetTrie) $(LREF CodepointSetTrie) $(LREF codepointTrie) $(LREF CodepointTrie) $(LREF toTrie) $(LREF toDelegate) )) $(TR $(TD Casing) $(TD $(LREF asCapitalized) $(LREF asLowerCase) $(LREF asUpperCase) $(LREF isLower) $(LREF isUpper) $(LREF toLower) $(LREF toLowerInPlace) $(LREF toUpper) $(LREF toUpperInPlace) )) $(TR $(TD Utf8Matcher) $(TD $(LREF isUtfMatcher) $(LREF MatcherConcept) $(LREF utfMatcher) )) $(TR $(TD Separators) $(TD $(LREF lineSep) $(LREF nelSep) $(LREF paraSep) )) $(TR $(TD Building blocks) $(TD $(LREF allowedIn) $(LREF combiningClass) $(LREF Grapheme) )) )) $(P All primitives listed operate on Unicode characters and sets of characters. For functions which operate on ASCII characters and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms used throughout this module see the $(S_LINK Terminology, terminology) section below. ) $(P The focus of this module is the core needs of developing Unicode-aware applications. To that effect it provides the following optimized primitives: ) $(UL $(LI Character classification by category and common properties: $(LREF isAlpha), $(LREF isWhite) and others. ) $(LI Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). ) $(LI Converting text to any of the four normalization forms via $(LREF normalize). ) $(LI Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) by user-perceived characters, that is by $(LREF Grapheme) clusters. ) $(LI Decomposing and composing of individual character(s) according to canonical or compatibility rules, see $(LREF compose) and $(LREF decompose), including the specific version for Hangul syllables $(LREF composeJamo) and $(LREF decomposeHangul). ) ) $(P It's recognized that an application may need further enhancements and extensions, such as less commonly known algorithms, or tailoring existing ones for region specific needs. To help users with building any extra functionality beyond the core primitives, the module provides: ) $(UL $(LI $(LREF CodepointSet), a type for easy manipulation of sets of characters. 
Besides the typical set algebra it provides an unusual feature: a D source code generator for detection of $(CODEPOINTS) in this set. This is a boon for meta-programming parser frameworks, and is used internally to power classification in small sets like $(LREF isWhite). ) $(LI A way to construct optimal packed multi-stage tables, also known as a special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). The functions $(LREF codepointTrie), $(LREF codepointSetTrie) construct custom tries that map dchar to value. The end result is a fast and predictable $(BIGOH 1) lookup that powers functions like $(LREF isAlpha) and $(LREF combiningClass), but for user-defined data sets. ) $(LI A useful technique for Unicode-aware parsers that perform character classification of encoded $(CODEPOINTS) is to avoid unnecessary decoding at all costs. $(LREF utfMatcher) provides an improvement over the usual workflow of decode-classify-process, combining the decoding and classification steps. By extracting necessary bits directly from encoded $(S_LINK Code unit, code units), matchers achieve significant performance improvements. See $(LREF MatcherConcept) for the common interface of UTF matchers. ) $(LI Generally useful building blocks for customized normalization: $(LREF combiningClass) for querying combining class and $(LREF allowedIn) for testing the Quick_Check property of a given normalization form. ) $(LI Access to a large selection of commonly used sets of $(CODEPOINTS). $(S_LINK Unicode properties, Supported sets) include Script, Block and General Category. The exact contents of a set can be observed in the CLDR utility, on the $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page of the Unicode website. See $(LREF unicode) for easy and (optionally) compile-time checked set queries. ) ) $(SECTION Synopsis) --- import std.uni; void main() { // initialize code point sets using script/block or property name // now 'set' contains code points from both scripts.
auto set = unicode("Cyrillic") | unicode("Armenian"); // same thing but simpler and checked at compile-time auto ascii = unicode.ASCII; auto currency = unicode.Currency_Symbol; // easy set ops auto a = set & ascii; assert(a.empty); // as it has no intersection with ascii a = set | ascii; auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian // some properties of code point sets assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 // testing presence of a code point in a set // is just fine, it is O(logN) assert(!b['$']); assert(!b['\u058F']); // Armenian dram sign assert(b['¥']); // building fast lookup tables, these guarantee O(1) complexity // 1-level Trie lookup table essentially a huge bit-set ~262Kb auto oneTrie = toTrie!1(b); // 2-level far more compact but typically slightly slower auto twoTrie = toTrie!2(b); // 3-level even smaller, and a bit slower yet auto threeTrie = toTrie!3(b); assert(oneTrie['£']); assert(twoTrie['£']); assert(threeTrie['£']); // build the trie with the most sensible trie level // and bind it as a functor auto cyrillicOrArmenian = toDelegate(set); auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); assert(balance == "ընկեր!"); // compatible with bool delegate(dchar) bool delegate(dchar) bindIt = cyrillicOrArmenian; // Normalization string s = "Plain ascii (and not only), is always normalized!"; assert(s is normalize(s)); // is the same string string nonS = "A\u0308ffin"; // 'A' followed by combining diaeresis auto nS = normalize(nonS); // to NFC, the W3C endorsed standard assert(nS == "Äffin"); assert(nS != nonS); string composed = "Äffin"; assert(normalize!NFD(composed) == "A\u0308ffin"); // to NFKD, compatibility decomposition useful for fuzzy matching/searching assert(normalize!NFKD("2¹⁰") == "210"); } --- $(SECTION Terminology) $(P The following is a list of important Unicode notions and definitions. Any conventions used specifically in this module alone are marked as such. The descriptions are based on the formal definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, chapter three of The Unicode Standard Core Specification.) ) $(P $(DEF Abstract character) A unit of information used for the organization, control, or representation of textual data. Note that: $(UL $(LI When representing data, the nature of that data is generally symbolic as opposed to some other kind of data (for example, visual). ) $(LI An abstract character has no concrete form and should not be confused with a $(S_LINK Glyph, glyph). ) $(LI An abstract character does not necessarily correspond to what a user thinks of as a “character” and should not be confused with a $(LREF Grapheme). ) $(LI The abstract characters encoded (see Encoded character) are known as Unicode abstract characters. ) $(LI Abstract characters not directly encoded by the Unicode Standard can often be represented by the use of combining character sequences. ) ) ) $(P $(DEF Canonical decomposition) The decomposition of a character or character sequence that results from recursively applying the canonical mappings found in the Unicode Character Database and those described in Conjoining Jamo Behavior (section 12 of $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). ) $(P $(DEF Canonical composition) The precise definition of the Canonical composition is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance) section 11.
Informally it's the process that does the reverse of the canonical decomposition with the addition of certain rules that e.g. prevent legacy characters from appearing in the composed result. ) $(P $(DEF Canonical equivalent) Two character sequences are said to be canonical equivalents if their full canonical decompositions are identical. ) $(P $(DEF Character) Typically differs by context. For the purpose of this documentation the term $(I character) implies $(I encoded character), that is, a code point having an assigned abstract character (a symbolic meaning). ) $(P $(DEF Code point) Any value in the Unicode codespace; that is, the range of integers from 0 to 10FFFF (hex). Not all code points are assigned to encoded characters. ) $(P $(DEF Code unit) The minimal bit combination that can represent a unit of encoded text for processing or interchange. Depending on the encoding, this could be: 8-bit code units in UTF-8 (`char`), 16-bit code units in UTF-16 (`wchar`), and 32-bit code units in UTF-32 (`dchar`). $(I Note that in UTF-32, a code unit is a code point and is represented by the D `dchar` type.) ) $(P $(DEF Combining character) A character with the General Category of Combining Mark (M). $(UL $(LI All characters with non-zero canonical combining class are combining characters, but the reverse is not the case: there are combining characters with a zero combining class. ) $(LI These characters are not normally used in isolation unless they are being described. They include such characters as accents, diacritics, Hebrew points, Arabic vowel signs, and Indic matras. ) ) ) $(P $(DEF Combining class) A numerical value used by the Unicode Canonical Ordering Algorithm to determine which sequences of combining marks are to be considered canonically equivalent and which are not. ) $(P $(DEF Compatibility decomposition) The decomposition of a character or character sequence that results from recursively applying both the compatibility mappings and the canonical mappings found in the Unicode Character Database, and those described in Conjoining Jamo Behavior, such that no characters can be further decomposed. ) $(P $(DEF Compatibility equivalent) Two character sequences are said to be compatibility equivalents if their full compatibility decompositions are identical. ) $(P $(DEF Encoded character) An association (or mapping) between an abstract character and a code point. ) $(P $(DEF Glyph) The actual, concrete image of a glyph representation having been rasterized or otherwise imaged onto some display surface. ) $(P $(DEF Grapheme base) A character with the property Grapheme_Base, or any standard Korean syllable block. ) $(P $(DEF Grapheme cluster) Defined as the text between grapheme boundaries as specified by Unicode Standard Annex #29, $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). Important general properties of a grapheme: $(UL $(LI The grapheme cluster represents a horizontally segmentable unit of text, consisting of some grapheme base (which may consist of a Korean syllable) together with any number of nonspacing marks applied to it. ) $(LI A grapheme cluster typically starts with a grapheme base and then extends across any subsequent sequence of nonspacing marks. A grapheme cluster is most directly relevant to text rendering and processes such as cursor placement and text selection in editing, but may also be relevant to comparison and searching.
) $(LI For many processes, a grapheme cluster behaves as if it were a single character with the same properties as its grapheme base. Effectively, nonspacing marks apply $(I graphically) to the base, but do not change its properties. ) ) $(P This module defines a number of primitives that work with graphemes: $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). All of them use $(I extended grapheme) boundaries as defined in the aforementioned standard annex. ) ) $(P $(DEF Nonspacing mark) A combining character with the General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). ) $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. ) $(SECTION Normalization) $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) or $(S_LINK Compatibility equivalent, compatibility equivalent) characters in the Unicode Standard make it necessary to have a full, formal definition of equivalence for Unicode strings. String equivalence is determined by a process called normalization, whereby strings are converted into forms which are compared directly for identity. This is the primary goal of the normalization process; see the function $(LREF normalize) to convert into any of the four defined forms. ) $(P A very important attribute of the Unicode Normalization Forms is that they must remain stable between versions of the Unicode Standard. A Unicode string normalized to a particular Unicode Normalization Form in one version of the standard is guaranteed to remain in that Normalization Form for implementations of future versions of the standard. ) $(P The Unicode Standard specifies four normalization forms. Informally, two of these forms are defined by maximal decomposition of equivalent sequences, and two of these forms are defined by maximal $(I composition) of equivalent sequences. $(UL $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, canonical decomposition) of a character sequence.) $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, compatibility decomposition) of a character sequence.) $(LI Normalization Form C (NFC): The canonical composition of the $(S_LINK Canonical decomposition, canonical decomposition) of a coded character sequence.) $(LI Normalization Form KC (NFKC): The canonical composition of the $(S_LINK Compatibility decomposition, compatibility decomposition) of a character sequence.) ) ) $(P The choice of the normalization form depends on the particular use case. NFC is the best form for general text, since it's more compatible with strings converted from legacy encodings. NFKC is the preferred form for identifiers, especially where there are security concerns. NFD and NFKD are the most useful for internal processing. ) $(SECTION Construction of lookup tables) $(P The Unicode standard describes a set of algorithms that depend on having the ability to quickly look up various properties of a code point. Given the codespace of about 1 million $(CODEPOINTS), it is not a trivial task to provide a space-efficient solution for the multitude of properties. ) $(P Common approaches such as hash-tables or binary search over sorted code point intervals (as in $(LREF InversionList)) are insufficient. Hash-tables have an enormous memory footprint and binary search over intervals is not fast enough for some heavy-duty algorithms.
) $(P The recommended solution (see Unicode Implementation Guidelines) is using multi-stage tables that are an implementation of the $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer keys and a fixed number of stages. For the remainder of the section this will be called a fixed trie. The following describes a particular implementation that is aimed at speed of access at the expense of ideal size savings. ) $(P Taking a 2-level Trie as an example, the principle of operation is as follows. Split the number of bits in a key (code point, 21 bits) into 2 components (e.g. 13 and 8). The first is the number of bits in the index of the trie and the other is the number of bits in each page of the trie. The layout of the trie is then an array of size 2^^bits-of-index followed by an array of memory chunks of size 2^^bits-of-page/bits-per-element. ) $(P The number of pages is variable (but not less than 1) unlike the number of entries in the index. The slots of the index all have to contain the number of a page that is present. The lookup is then just a couple of operations - slice the upper bits, look up the index for these, take a page at this index and use the lower bits as an offset within this page. Assuming that pages are laid out consecutively in one array at `pages`, the pseudo-code is: ) --- auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; --- $(P If `elemsPerPage` is a power of 2, the whole process is a handful of simple instructions and 2 array reads. Subsequent levels of the trie are introduced by recursing on this notion - the index array is treated as values. The number of bits in the index is then again split into 2 parts, with pages over 'current-index' and the new 'upper-index'. ) $(P For completeness, a level 1 trie is simply an array. The current implementation takes advantage of bit-packing values when the range is known to be limited in advance (such as `bool`). See also $(LREF BitPacked) for enforcing it manually. The major size advantage however comes from the fact that multiple $(B identical pages on every level are merged) by construction. ) $(P The process of constructing a trie is more involved and is hidden from the user in the form of the convenience functions $(LREF codepointTrie), $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). In general, a set or a built-in AA with `dchar` keys can be turned into a trie. The trie object in this module is read-only (immutable); it's effectively frozen after construction. ) $(SECTION Unicode properties) $(P This is a full list of Unicode properties accessible through $(LREF unicode) with specific helpers per category nested within. Consult the $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) when in doubt about the contents of a particular set. ) $(P General category sets listed below are only accessible with the $(LREF unicode) shorthand accessor.) $(BOOKTABLE $(B General category ), $(TR $(TH Abb.) $(TH Long form) $(TH Abb.) $(TH Long form) $(TH Abb.)
$(TH Long form)) $(TR $(TD L) $(TD Letter) $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) $(TR $(TD Ll) $(TD Lowercase_Letter) $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) $(TR $(TD Lm) $(TD Modifier_Letter) $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) $(TR $(TD Lo) $(TD Other_Letter) $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) $(TR $(TD Lt) $(TD Titlecase_Letter) $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) $(TR $(TD Lu) $(TD Uppercase_Letter) $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) $(TR $(TD M) $(TD Mark) $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) $(TR $(TD Mc) $(TD Spacing_Mark) $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) $(TR $(TD Me) $(TD Enclosing_Mark) $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) $(TR $(TD Mn) $(TD Nonspacing_Mark) $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) $(TR $(TD C) $(TD Other) $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) $(TR $(TD Cc) $(TD Control) $(TD Pf) $(TD Final_Punctuation) $(TD -) $(TD Any)) $(TR $(TD Cf) $(TD Format) $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) ) $(P Sets for other commonly useful properties that are accessible with $(LREF unicode):) $(BOOKTABLE $(B Common binary properties), $(TR $(TH Name) $(TH Name) $(TH Name)) $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation)) $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) $(TR $(TD Hyphen) $(TD Other_Lowercase) ) $(TR $(TD ID_Continue) $(TD Other_Math) ) ) $(P Below is the table with block names accepted by $(LREF unicode.block). Note that the shorthand version $(LREF unicode) requires "In" to be prepended to the names of blocks so as to disambiguate scripts and blocks. 
) $(BOOKTABLE $(B Blocks), $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya)) $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation)) $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) 
$(TD Tagbanwa)) $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement)) $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) ) $(P Below is the table with script names accepted by $(LREF unicode.script) and by the shorthand version $(LREF unicode):) $(BOOKTABLE $(B Scripts), $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa)) $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) $(TR $(TD 
Glagolitic) $(TD Miao) $(TD Thaana)) $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) $(TR $(TD Han) $(TD Ogham) $(TD Vai)) $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) ) $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) $(BOOKTABLE $(B Hangul syllable type), $(TR $(TH Abb.) $(TH Long form)) $(TR $(TD L) $(TD Leading_Jamo)) $(TR $(TD LV) $(TD LV_Syllable)) $(TR $(TD LVT) $(TD LVT_Syllable) ) $(TR $(TD T) $(TD Trailing_Jamo)) $(TR $(TD V) $(TD Vowel_Jamo)) ) References: $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), $(HTTP www.unicode.org, The Unicode Consortium), $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) $(HTTP www.unicode.org/uni2book/ch05.pdf, Unicode Implementation Guidelines) $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance) Trademarks: Unicode(tm) is a trademark of Unicode, Inc. Copyright: Copyright 2013 - License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). Authors: Dmitry Olshansky Source: $(PHOBOSSRC std/uni/package.d) Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2) Macros: SECTION =
0)
alias GetBitSlicing =
AliasSeq!(sliceBits!(top - sizes[0], top),
GetBitSlicing!(top - sizes[0], sizes[1..$]));
else
alias GetBitSlicing = AliasSeq!();
}
template callableWith(T)
{
template callableWith(alias Pred)
{
static if (!is(typeof(Pred(T.init))))
enum callableWith = false;
else
{
alias Result = typeof(Pred(T.init));
enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
}
}
}
/*
Check if `Prefix` is a valid set of predicates
for `Trie` template having `Key` as the type of keys.
This requires all predicates to be callable, take a
single argument of type `Key` and return an unsigned value.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
import std.meta : allSatisfy;
enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
}
/*
Check if `Args` is a set of maximum key value followed by valid predicates
for `Trie` template having `Key` as the type of keys.
*/
template isValidArgsForTrie(Key, Args...)
{
static if (Args.length > 1)
{
enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
|| (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
}
else
enum isValidArgsForTrie = isValidPrefixForTrie!Args;
}
@property size_t sumOfIntegerTuple(ints...)()
{
size_t count=0;
foreach (v; ints)
count += v;
return count;
}
/**
A shorthand for creating a custom multi-level fixed Trie
from a `CodepointSet`. `sizes` are numbers of bits per level,
with the most significant bits used first.
Note: The sum of `sizes` must equal 21.
See_Also: $(LREF toTrie), which is even simpler.
Example:
---
{
import std.stdio;
auto set = unicode("Number");
auto trie = codepointSetTrie!(8, 5, 8)(set);
writeln("Input code points to test:");
foreach (line; stdin.byLine)
{
int count=0;
foreach (dchar ch; line)
if (trie[ch])// is number
count++;
writefln("Contains %d number code points.", count);
}
}
---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
auto codepointSetTrie(Set)(Set set)
if (isCodepointSet!Set)
{
auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
foreach (ival; set.byInterval)
builder.putRange(ival[0], ival[1], true);
return builder.build();
}
}
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
alias Prefix = GetBitSlicing!(21, sizes);
alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build());
}
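// A minimal usage sketch for the two templates above: build an 8-5-8 trie
// from a set obtained via the `unicode` accessor and query it. The chosen
// level sizes and the ASCII set are illustrative assumptions only.
@system unittest
{
    auto ascii = unicode.ASCII;
    auto trie = codepointSetTrie!(8, 5, 8)(ascii);
    assert(trie['A']);
    assert(!trie['\u00E9']); // 'é' is outside of ASCII
    // CodepointSetTrie!(8, 5, 8) names the type produced above
    static assert(is(typeof(trie) == CodepointSetTrie!(8, 5, 8)));
}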
/**
A slightly more general tool for building a fixed `Trie`
for Unicode data.
Specifically, unlike `codepointSetTrie`, it allows creating mappings
of `dchar` to an arbitrary type `T`.
Note: The overload taking `CodepointSet`s will naturally convert
only to bool-mapping `Trie`s.
`CodepointTrie` is the type of `Trie` generated by the `codepointTrie` function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
alias Prefix = GetBitSlicing!(21, sizes);
static if (is(TypeOfBitPacked!T == bool))
{
auto codepointTrie(Set)(const scope Set set)
if (isCodepointSet!Set)
{
return codepointSetTrie(set);
}
}
///
auto codepointTrie()(T[dchar] map, T defValue=T.init)
{
return buildTrie!(T, dchar, Prefix)(map, defValue);
}
// unsorted range of pairs
///
auto codepointTrie(R)(R range, T defValue=T.init)
if (isInputRange!R
&& is(typeof(ElementType!R.init[0]) : T)
&& is(typeof(ElementType!R.init[1]) : dchar))
{
// build from unsorted array of pairs
// TODO: expose index sorting functions for Trie
return buildTrie!(T, dchar, Prefix)(range, defValue, true);
}
}
@system pure unittest
{
import std.algorithm.comparison : max;
import std.algorithm.searching : count;
// pick characters from the Greek script
auto set = unicode.Greek;
// a user-defined property (or an expensive function)
// that we want to look up
static uint luckFactor(dchar ch)
{
// here we consider a character lucky
// if its code point has a lot of identical hex-digits
// e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
ubyte[6] nibbles; // 6 4-bit chunks of code point
uint value = ch;
foreach (i; 0 .. 6)
{
nibbles[i] = value & 0xF;
value >>= 4;
}
uint luck;
foreach (n; nibbles)
luck = cast(uint) max(luck, count(nibbles[], n));
return luck;
}
// only unsigned built-ins are supported at the moment
alias LuckFactor = BitPacked!(uint, 3);
// create a temporary associative array (AA)
LuckFactor[dchar] map;
foreach (ch; set.byCodepoint)
map[ch] = LuckFactor(luckFactor(ch));
// bits per stage are chosen arbitrarily, feel free to optimize
auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);
// from now on the AA is not needed
foreach (ch; set.byCodepoint)
assert(trie[ch] == luckFactor(ch)); // verify
// CJK is not Greek, thus it has the default value
assert(trie['\u4444'] == 0);
// and here is a couple of quite lucky Greek characters:
// Greek small letter epsilon with dasia
assert(trie['\u1F11'] == 3);
// Ancient Greek metretes sign
assert(trie['\U00010181'] == 3);
}
/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
alias Prefix = GetBitSlicing!(21, sizes);
alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build());
}
package(std) template cmpK0(alias Pred)
{
import std.typecons : Tuple;
static bool cmpK0(Value, Key)
(Tuple!(Value, Key) a, Tuple!(Value, Key) b)
{
return Pred(a[1]) < Pred(b[1]);
}
}
/**
The most general utility for construction of `Trie`s
short of using `TrieBuilder` directly.
Provides a number of convenience overloads.
`Args` is a tuple of the maximum key value followed by
predicates to construct an index from a key.
Alternatively, if the first argument is not a value convertible to `Key`,
then the whole tuple of `Args` is treated as predicates
and the maximum `Key` is deduced from the predicates.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
{
alias Prefix = Args[1..$];
}
else
alias Prefix = Args;
alias getIndex = mapTrieIndex!(Prefix);
// for multi-sort
template GetComparators(size_t n)
{
static if (n > 0)
alias GetComparators =
AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
else
alias GetComparators = AliasSeq!();
}
/*
Build `Trie` from a range of Key-Value pairs,
assuming it is sorted by Key as defined by the following lambda:
------
(a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
------
An exception is thrown if it's detected that the above order doesn't hold.
In other words $(LREF mapTrieIndex) should be a
monotonically increasing function that maps `Key` to an integer.
See_Also: $(REF sort, std,_algorithm),
$(REF SortedRange, std,range),
$(REF setUnion, std,_algorithm).
*/
auto buildTrie(Range)(Range range, Value filler=Value.init)
if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
&& is(typeof(Range.init.front[1]) : Key))
{
auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
foreach (v; range)
builder.putValue(v[1], v[0]);
return builder.build();
}
/*
If `Value` is bool (or BitPacked!(bool, x)) then it's possible
to build `Trie` from a range of open-right intervals of `Key`s.
The requirement on the ordering of keys (and the behavior on the
violation of it) is the same as for Key-Value range overload.
Intervals denote ranges of !`filler`, i.e. the opposite of `filler`.
If no filler is provided, keys inside the intervals map to `true`,
and `filler` is `false`.
*/
auto buildTrie(Range)(Range range, Value filler=Value.init)
if (is(TypeOfBitPacked!Value == bool)
&& isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
&& is(typeof(Range.init.front[1]) : Key))
{
auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
foreach (ival; range)
builder.putRange(ival[0], ival[1], !filler);
return builder.build();
}
auto buildTrie(Range)(Range range, Value filler, bool unsorted)
if (isInputRange!Range
&& is(typeof(Range.init.front[0]) : Value)
&& is(typeof(Range.init.front[1]) : Key))
{
import std.algorithm.sorting : multiSort;
alias Comps = GetComparators!(Prefix.length);
if (unsorted)
multiSort!(Comps)(range);
return buildTrie(range, filler);
}
/*
If `Value` is bool (or BitPacked!(bool, x)) then it's possible
to build `Trie` simply from an input range of `Key`s.
The requirement on the ordering of keys (and the behavior on the
violation of it) is the same as for Key-Value range overload.
Keys found in the range denote !`filler`, i.e. the opposite of `filler`.
If no filler is provided, keys map to `true` and `filler` is `false`.
*/
auto buildTrie(Range)(Range range, Value filler=Value.init)
if (is(TypeOfBitPacked!Value == bool)
&& isInputRange!Range && is(typeof(Range.init.front) : Key))
{
auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
foreach (v; range)
builder.putValue(v, !filler);
return builder.build();
}
/*
If `Key` is an unsigned integer, a `Trie` can be constructed from an array
of values where the array index serves as the key.
*/
auto buildTrie()(Value[] array, Value filler=Value.init)
if (isUnsigned!Key)
{
auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
foreach (idx, v; array)
builder.putValue(idx, v);
return builder.build();
}
/*
Builds `Trie` from associative array.
*/
auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
{
import std.array : array;
import std.range : zip;
auto range = array(zip(map.values, map.keys));
return buildTrie(range, filler, true); // sort it
}
}
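// A small sketch of using the general `buildTrie` machinery directly
// (normally wrapped by `codepointTrie`): map a couple of code points to
// `ubyte` tags from an unsorted range of (value, key) tuples. The 8-5-8
// bit slicing below is an arbitrary choice for illustration.
@system unittest
{
    import std.typecons : tuple;
    alias Prefix = GetBitSlicing!(21, 8, 5, 8);
    auto pairs = [tuple(cast(ubyte) 2, cast(dchar) 'z'),
                  tuple(cast(ubyte) 1, cast(dchar) 'a')];
    // last argument requests sorting of the (value, key) range first
    auto trie = buildTrie!(ubyte, dchar, Prefix)(pairs, cast(ubyte) 0, true);
    assert(trie['a'] == 1);
    assert(trie['z'] == 2);
    assert(trie['b'] == 0); // unmapped keys yield the filler value
}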
// helper in place of assumeSize to
//reduce mangled name & help DMD inline Trie functors
struct clamp(size_t bits)
{
static size_t opCall(T)(T arg){ return arg; }
enum bitSize = bits;
}
struct clampIdx(size_t idx, size_t bits)
{
static size_t opCall(T)(T arg){ return arg[idx]; }
enum bitSize = bits;
}
/**
Conceptual type that outlines the common properties of all UTF Matchers.
Note: For illustration purposes only, every method
call results in assertion failure.
Use $(LREF utfMatcher) to obtain a concrete matcher
for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
/**
$(P Performs the semantic equivalent of 2 operations:
decoding a $(CODEPOINT) at the front of `inp` and testing if
it belongs to the set of $(CODEPOINTS) of this matcher. )
$(P The effect on `inp` depends on the kind of function called:)
$(P Match. If the codepoint is found in the set then the range `inp`
is advanced by its size in $(S_LINK Code unit, code units),
otherwise the range is not modified.)
$(P Skip. The range is always advanced by the size
of the tested $(CODEPOINT) regardless of the result of test.)
$(P Test. The range is left unaffected regardless
of the result of test.)
*/
public bool match(Range)(ref Range inp)
if (isRandomAccessRange!Range && is(ElementType!Range : char))
{
assert(false);
}
///ditto
public bool skip(Range)(ref Range inp)
if (isRandomAccessRange!Range && is(ElementType!Range : char))
{
assert(false);
}
///ditto
public bool test(Range)(ref Range inp)
if (isRandomAccessRange!Range && is(ElementType!Range : char))
{
assert(false);
}
///
pure @safe unittest
{
string truth = "2² = 4";
auto m = utfMatcher!char(unicode.Number);
assert(m.match(truth)); // '2' is a number all right
assert(truth == "² = 4"); // skips on match
assert(m.match(truth)); // so is the superscript '2'
assert(!m.match(truth)); // space is not a number
assert(truth == " = 4"); // unaffected on no match
assert(!m.skip(truth)); // same test ...
assert(truth == "= 4"); // but skips a codepoint regardless
assert(!m.test(truth)); // '=' is not a number
assert(truth == "= 4"); // test never affects argument
}
/**
Advanced feature - provide direct access to a sub-matcher based on a
set of known encoding lengths. Lengths are provided in
$(S_LINK Code unit, code units). The sub-matcher may then do fewer
operations per `test`/`match`.
Use with care as the sub-matcher won't match
any $(CODEPOINTS) that have an encoded length that doesn't belong
to the selected set of lengths. Also the sub-matcher object references
the parent matcher and must not be used past the lifetime
of the latter.
Another caveat of using a sub-matcher is that `skip` is not available
precisely because a sub-matcher doesn't detect all lengths.
*/
@property auto subMatcher(Lengths...)()
{
assert(0);
return this;
}
pure @safe unittest
{
auto m = utfMatcher!char(unicode.Number);
string square = "2²";
// about sub-matchers
assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
assert(m.subMatcher!1.match(square)); // ASCII-only, works
assert(!m.subMatcher!1.test(square)); // unicode '²'
assert(m.subMatcher!(2,3,4).match(square)); //
assert(square == "");
wstring wsquare = "2²";
auto m16 = utfMatcher!wchar(unicode.Number);
// may keep ref, but the original (m16) must be kept alive
auto bmp = m16.subMatcher!1;
assert(bmp.match(wsquare)); // Okay, in the Basic Multilingual Plane
assert(bmp.match(wsquare)); // And '²' too
}
}
/**
Test if `M` is a UTF Matcher for ranges of `C` characters.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
C[] s;
auto d = s.decoder;
M m;
assert(is(typeof(m.match(d)) == bool));
assert(is(typeof(m.test(d)) == bool));
static if (is(typeof(m.skip(d))))
{
assert(is(typeof(m.skip(d)) == bool));
assert(is(typeof(m.skip(s)) == bool));
}
assert(is(typeof(m.match(s)) == bool));
assert(is(typeof(m.test(s)) == bool));
});
pure @safe unittest
{
alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
static assert(isUtfMatcher!(CharMatcher, char));
static assert(isUtfMatcher!(CharMatcher, immutable(char)));
static assert(isUtfMatcher!(WcharMatcher, wchar));
static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}
enum Mode {
alwaysSkip,
neverSkip,
skipOnMatch
}
mixin template ForwardStrings()
{
private bool fwdStr(string fn, C)(ref C[] str) const @trusted
{
import std.utf : byCodeUnit;
alias type = typeof(byCodeUnit(str));
return mixin(fn~"(*cast(type*)&str)");
}
}
template Utf8Matcher()
{
enum validSize(int sz) = sz >= 1 && sz <= 4;
void badEncoding() pure @safe
{
import std.utf : UTFException;
throw new UTFException("Invalid UTF-8 sequence");
}
//for 1-stage ASCII
alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
//for 2-stage lookup of 2 byte UTF-8 sequences
alias Utf8Spec2 = AliasSeq!(bool, char[2],
clampIdx!(0, 5), clampIdx!(1, 6));
//ditto for 3 byte
alias Utf8Spec3 = AliasSeq!(bool, char[3],
clampIdx!(0, 4),
clampIdx!(1, 6),
clampIdx!(2, 6)
);
//ditto for 4 byte
alias Utf8Spec4 = AliasSeq!(bool, char[4],
clampIdx!(0, 3), clampIdx!(1, 6),
clampIdx!(2, 6), clampIdx!(3, 6)
);
alias Tables = AliasSeq!(
typeof(TrieBuilder!(AsciiSpec)(false).build()),
typeof(TrieBuilder!(Utf8Spec2)(false).build()),
typeof(TrieBuilder!(Utf8Spec3)(false).build()),
typeof(TrieBuilder!(Utf8Spec4)(false).build())
);
alias Table(int size) = Tables[size-1];
enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
enum encMask(size_t size) = ((1 << size)-1)<<(8-size);
char truncate()(char ch) pure @safe
{
ch -= 0x80;
if (ch < 0x40)
{
return ch;
}
else
{
badEncoding();
return cast(char) 0;
}
}
static auto encode(size_t sz)(dchar ch)
if (sz > 1)
{
import std.utf : encodeUTF = encode;
char[4] buf;
encodeUTF(buf, ch);
char[sz] ret;
buf[0] &= leadMask!sz;
foreach (n; 1 .. sz)
buf[n] = buf[n] & 0x3f; //keep 6 lower bits
ret[] = buf[0 .. sz];
return ret;
}
auto build(Set)(Set set)
{
import std.algorithm.iteration : map;
auto ascii = set & unicode.ASCII;
auto utf8_2 = set & CodepointSet(0x80, 0x800);
auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
alias Ret = Impl!(1,2,3,4);
return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
}
// Bootstrap UTF-8 static matcher interface
// from 3 primitives: tab!(size), lookup and Sizes
mixin template DefMatcher()
{
import std.format : format;
import std.meta : Erase, staticIndexOf;
enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
alias UniSizes = Erase!(1, Sizes);
//generate dispatch code sequence for unicode parts
static auto genDispatch()
{
string code;
foreach (size; UniSizes)
code ~= format(q{
if ((ch & ~leadMask!%d) == encMask!(%d))
return lookup!(%d, mode)(inp);
else
}, size, size, size);
static if (Sizes.length == 4) //covers all code unit cases
code ~= "{ badEncoding(); return false; }";
else
code ~= "return false;"; //may be just fine but not covered
return code;
}
enum dispatch = genDispatch();
public bool match(Range)(ref Range inp) const
if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
!isDynamicArray!Range)
{
enum mode = Mode.skipOnMatch;
assert(!inp.empty);
immutable ch = inp[0];
static if (hasASCII)
{
if (ch < 0x80)
{
immutable r = tab!1[ch];
if (r)
inp.popFront();
return r;
}
else
mixin(dispatch);
}
else
mixin(dispatch);
}
static if (Sizes.length == 4) // can skip iff can detect all encodings
{
public bool skip(Range)(ref Range inp) const
if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
!isDynamicArray!Range)
{
enum mode = Mode.alwaysSkip;
assert(!inp.empty);
auto ch = inp[0];
static if (hasASCII)
{
if (ch < 0x80)
{
inp.popFront();
return tab!1[ch];
}
else
mixin(dispatch);
}
else
mixin(dispatch);
}
}
public bool test(Range)(ref Range inp) const
if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
!isDynamicArray!Range)
{
enum mode = Mode.neverSkip;
assert(!inp.empty);
auto ch = inp[0];
static if (hasASCII)
{
if (ch < 0x80)
return tab!1[ch];
else
mixin(dispatch);
}
else
mixin(dispatch);
}
bool match(C)(ref C[] str) const
if (isSomeChar!C)
{
return fwdStr!"match"(str);
}
bool skip(C)(ref C[] str) const
if (isSomeChar!C)
{
return fwdStr!"skip"(str);
}
bool test(C)(ref C[] str) const
if (isSomeChar!C)
{
return fwdStr!"test"(str);
}
mixin ForwardStrings;
}
struct Impl(Sizes...)
{
import std.meta : allSatisfy, staticMap;
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
//pick tables for chosen sizes
alias OurTabs = staticMap!(Table, Sizes);
OurTabs tables;
mixin DefMatcher;
//static dispatch helper: UTF size ==> table
alias tab(int i) = tables[i - 1];
package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
{
return CherryPick!(Impl, SizesToPick)(&this);
}
bool lookup(int size, Mode mode, Range)(ref Range inp) const
{
import std.range : popFrontN;
if (inp.length < size)
{
badEncoding();
return false;
}
char[size] needle = void;
needle[0] = leadMask!size & inp[0];
static foreach (i; 1 .. size)
{
needle[i] = truncate(inp[i]);
}
//overlong encoding checks
static if (size == 2)
{
//0x80-0x7FF
//got 6 bits in needle[1], must use at least 8 bits
//must use at least 2 bits in needle[0]
if (needle[0] < 2) badEncoding();
}
else static if (size == 3)
{
//0x800-0xFFFF
//got 6 bits in needle[2], must use at least 12bits
//must use 6 bits in needle[1] or anything in needle[0]
if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
}
else static if (size == 4)
{
//0x10000-0x10FFFF
//got 2x6=12 bits in needle[2 .. 4], must use at least 17 bits
//must use 5 bits (or above) in needle[1] or anything in needle[0]
if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
}
static if (mode == Mode.alwaysSkip)
{
inp.popFrontN(size);
return tab!size[needle];
}
else static if (mode == Mode.neverSkip)
{
return tab!size[needle];
}
else
{
static assert(mode == Mode.skipOnMatch);
if (tab!size[needle])
{
inp.popFrontN(size);
return true;
}
else
return false;
}
}
}
struct CherryPick(I, Sizes...)
{
import std.meta : allSatisfy;
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
I* m;
@property auto tab(int i)() const { return m.tables[i - 1]; }
bool lookup(int size, Mode mode, Range)(ref Range inp) const
{
return m.lookup!(size, mode)(inp);
}
mixin DefMatcher;
}
}
template Utf16Matcher()
{
enum validSize(int sz) = sz >= 1 && sz <= 2;
void badEncoding() pure @safe
{
import std.utf : UTFException;
throw new UTFException("Invalid UTF-16 sequence");
}
// 1-stage ASCII
alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
//2-stage BMP
alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
//4-stage - full Unicode
//assume that 0xD800 & 0xDC00 bits are cleared
//thus leaving 10 bits per wchar to worry about
alias UniSpec = AliasSeq!(bool, wchar[2],
assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
);
alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
auto encode2(dchar ch)
{
ch -= 0x1_0000;
assert(ch <= 0xF_FFFF);
wchar[2] ret;
//do not put surrogate bits, they are sliced off
ret[0] = cast(wchar)(ch >> 10);
ret[1] = (ch & 0x3FF); // keep only the low 10 bits; the high 10 bits are in ret[0]
return ret;
}
auto build(Set)(Set set)
{
import std.algorithm.iteration : map;
auto ascii = set & unicode.ASCII;
auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
- CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
auto other = set - (bmp | ascii);
auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
alias Ret = Impl!(1,2);
return Ret(asciiT, bmpT, otherT);
}
//bootstrap full UTF-16 matcher interface from
//sizeFlags, lookupUni and ascii
mixin template DefMatcher()
{
public bool match(Range)(ref Range inp) const
if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
!isDynamicArray!Range)
{
enum mode = Mode.skipOnMatch;
assert(!inp.empty);
immutable ch = inp[0];
static if (sizeFlags & 1)
{
if (ch < 0x80)
{
if (ascii[ch])
{
inp.popFront();
return true;
}
else
return false;
}
return lookupUni!mode(inp);
}
else
return lookupUni!mode(inp);
}
static if (Sizes.length == 2)
{
public bool skip(Range)(ref Range inp) const
if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
!isDynamicArray!Range)
{
enum mode = Mode.alwaysSkip;
assert(!inp.empty);
immutable ch = inp[0];
static if (sizeFlags & 1)
{
if (ch < 0x80)
{
inp.popFront();
return ascii[ch];
}
else
return lookupUni!mode(inp);
}
else
return lookupUni!mode(inp);
}
}
public bool test(Range)(ref Range inp) const
if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
!isDynamicArray!Range)
{
enum mode = Mode.neverSkip;
assert(!inp.empty);
auto ch = inp[0];
static if (sizeFlags & 1)
return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
else
return lookupUni!mode(inp);
}
bool match(C)(ref C[] str) const
if (isSomeChar!C)
{
return fwdStr!"match"(str);
}
bool skip(C)(ref C[] str) const
if (isSomeChar!C)
{
return fwdStr!"skip"(str);
}
bool test(C)(ref C[] str) const
if (isSomeChar!C)
{
return fwdStr!"test"(str);
}
mixin ForwardStrings; //dispatch strings to range versions
}
struct Impl(Sizes...)
if (Sizes.length >= 1 && Sizes.length <= 2)
{
private:
import std.meta : allSatisfy;
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1 and 2 code units are possible in UTF-16");
static if (Sizes.length > 1)
enum sizeFlags = Sizes[0] | Sizes[1];
else
enum sizeFlags = Sizes[0];
static if (sizeFlags & 1)
{
Ascii ascii;
Bmp bmp;
}
static if (sizeFlags & 2)
{
Uni uni;
}
mixin DefMatcher;
package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
{
return CherryPick!(Impl, SizesToPick)(&this);
}
bool lookupUni(Mode mode, Range)(ref Range inp) const
{
wchar x = cast(wchar)(inp[0] - 0xD800);
//not a high surrogate
if (x > 0x3FF)
{
//low surrogate
if (x <= 0x7FF) badEncoding();
static if (sizeFlags & 1)
{
auto ch = inp[0];
static if (mode == Mode.alwaysSkip)
inp.popFront();
static if (mode == Mode.skipOnMatch)
{
if (bmp[ch])
{
inp.popFront();
return true;
}
else
return false;
}
else
return bmp[ch];
}
else //skip is not available for sub-matchers, so just false
return false;
}
else
{
import std.range : popFrontN;
static if (sizeFlags & 2)
{
if (inp.length < 2)
badEncoding();
wchar y = cast(wchar)(inp[1] - 0xDC00);
//not a low surrogate
if (y > 0x3FF)
badEncoding();
wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
static if (mode == Mode.alwaysSkip)
inp.popFrontN(2);
static if (mode == Mode.skipOnMatch)
{
if (uni[needle])
{
inp.popFrontN(2);
return true;
}
else
return false;
}
else
return uni[needle];
}
else //ditto
return false;
}
}
}
struct CherryPick(I, Sizes...)
if (Sizes.length >= 1 && Sizes.length <= 2)
{
private:
import std.meta : allSatisfy;
I* m;
enum sizeFlags = I.sizeFlags;
static if (sizeFlags & 1)
{
@property auto ascii()() const { return m.ascii; }
}
bool lookupUni(Mode mode, Range)(ref Range inp) const
{
return m.lookupUni!mode(inp);
}
mixin DefMatcher;
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1 and 2 code units are possible in UTF-16");
}
}
private auto utf8Matcher(Set)(Set set)
{
return Utf8Matcher!().build(set);
}
private auto utf16Matcher(Set)(Set set)
{
return Utf16Matcher!().build(set);
}
/**
Constructs a matcher object
to classify $(CODEPOINTS) from the `set` for the encoding
that has `Char` as its code unit.
See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
static if (is(Char : char))
return utf8Matcher(set);
else static if (is(Char : wchar))
return utf16Matcher(set);
else static if (is(Char : dchar))
static assert(false, "UTF-32 needs no decoding,
and thus is not supported by utfMatcher");
else
static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
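// A brief illustration of the matcher construction above, mirroring the
// MatcherConcept examples: classify the leading code point of a UTF-16
// string without decoding it. The Letter set is an arbitrary choice.
pure @safe unittest
{
    auto letters = utfMatcher!wchar(unicode.L);
    wstring s = "x²"w;
    assert(letters.match(s));  // 'x' is a letter; consumed on match
    assert(!letters.test(s));  // '²' is a number, not a letter
    assert(s == "²"w);         // test never advances the input
}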
//a range of code units, paired with an index to speed up forward iteration
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
static struct Decoder
{
pure nothrow:
C[] str;
size_t idx;
@property C front(){ return str[idx]; }
@property C back(){ return str[$-1]; }
void popFront(){ idx++; }
void popBack(){ str = str[0..$-1]; }
void popFrontN(size_t n){ idx += n; }
@property bool empty(){ return idx == str.length; }
@property auto save(){ return this; }
auto opIndex(size_t i){ return str[idx+i]; }
@property size_t length(){ return str.length - idx; }
alias opDollar = length;
auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
}
static assert(isRandomAccessRange!Decoder);
static assert(is(ElementType!Decoder : C));
return Decoder(s, offset);
}
pure @safe unittest
{
string rs = "hi! ネемног砀 текста";
auto codec = rs.decoder;
auto utf8 = utf8Matcher(unicode.Letter);
auto asc = utf8.subMatcher!(1);
auto uni = utf8.subMatcher!(2,3,4);
assert(asc.test(codec));
assert(!uni.match(codec));
assert(utf8.skip(codec));
assert(codec.idx == 1);
assert(!uni.match(codec));
assert(asc.test(codec));
assert(utf8.skip(codec));
assert(codec.idx == 2);
assert(!asc.match(codec));
assert(!utf8.test(codec));
assert(!utf8.skip(codec));
assert(!asc.test(codec));
assert(!utf8.test(codec));
assert(!utf8.skip(codec));
assert(utf8.test(codec));
foreach (i; 0 .. 7)
{
assert(!asc.test(codec));
assert(uni.test(codec));
assert(utf8.skip(codec));
}
assert(!utf8.test(codec));
assert(!utf8.skip(codec));
//the same with match where applicable
codec = rs.decoder;
assert(utf8.match(codec));
assert(codec.idx == 1);
assert(utf8.match(codec));
assert(codec.idx == 2);
assert(!utf8.match(codec));
assert(codec.idx == 2);
assert(!utf8.skip(codec));
assert(!utf8.skip(codec));
foreach (i; 0 .. 7)
{
assert(!asc.test(codec));
assert(utf8.test(codec));
assert(utf8.match(codec));
}
auto i = codec.idx;
assert(!utf8.match(codec));
assert(codec.idx == i);
}
pure @safe unittest
{
import std.range : stride;
static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
{
bool t = m.test(r);
auto save = r.idx;
assert(t == m.match(r));
assert(r.idx == save || t); //either no change or there was a match
r.idx = save;
static if (is(typeof(m.skip(r))))
{
assert(t == m.skip(r));
assert(r.idx != save); //always changed
r.idx = save;
}
return t;
}
auto utf16 = utfMatcher!wchar(unicode.L);
auto bmp = utf16.subMatcher!1;
auto nonBmp = utf16.subMatcher!2;
auto utf8 = utfMatcher!char(unicode.L);
auto ascii = utf8.subMatcher!1;
auto uni2 = utf8.subMatcher!2;
auto uni3 = utf8.subMatcher!3;
auto uni24 = utf8.subMatcher!(2,4);
foreach (ch; unicode.L.byCodepoint.stride(3))
{
import std.utf : encode;
char[4] buf;
wchar[2] buf16;
auto len = encode(buf, ch);
auto len16 = encode(buf16, ch);
auto c8 = buf[0 .. len].decoder;
auto c16 = buf16[0 .. len16].decoder;
assert(testAll(utf16, c16));
assert(testAll(bmp, c16) || len16 != 1);
assert(testAll(nonBmp, c16) || len16 != 2);
assert(testAll(utf8, c8));
//sub-matchers return false outside of their domain
assert(testAll(ascii, c8) || len != 1);
assert(testAll(uni2, c8) || len != 2);
assert(testAll(uni3, c8) || len != 3);
assert(testAll(uni24, c8) || (len != 2 && len != 4));
}
}
// cover decode fail cases of Matcher
pure @safe unittest
{
import std.algorithm.iteration : map;
import std.exception : collectException;
import std.format : format;
auto utf16 = utfMatcher!wchar(unicode.L);
auto utf8 = utfMatcher!char(unicode.L);
//decode failure cases UTF-8
alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
"\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
"\xCF\x00\0x00\0x00\x00");
foreach (msg; fails8)
{
assert(collectException((){
auto s = msg;
size_t idx = 0;
utf8.test(s);
}()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
}
//decode failure cases UTF-16
alias fails16 = AliasSeq!([0xD811], [0xDC02]);
foreach (msg; fails16)
{
assert(collectException((){
auto s = msg.map!(x => cast(wchar) x);
utf16.test(s);
}()));
}
}
/++
Convenience function to construct optimal configurations for
packed Trie from any `set` of $(CODEPOINTS).
The parameter `level` indicates the number of trie levels to use,
allowed values are 1, 2, 3 or 4. Levels represent different speed/size
trade-offs.
$(P Level 1 is fastest and the most memory hungry (a bit array). )
$(P Level 4 is the slowest and has the smallest footprint. )
See the $(S_LINK Synopsis, Synopsis) section for example.
Note:
Level 4 stays very practical (being faster and more predictable)
compared to using direct lookup on the `set` itself.
+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
static if (level == 1)
return codepointSetTrie!(21)(set);
else static if (level == 2)
return codepointSetTrie!(10, 11)(set);
else static if (level == 3)
return codepointSetTrie!(8, 5, 8)(set);
else static if (level == 4)
return codepointSetTrie!(6, 4, 4, 7)(set);
else
static assert(false,
"Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly");
}
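// A brief usage sketch; level 2 here is an arbitrary middle ground in the
// speed/size trade-off described above.
@system unittest
{
    auto digits = unicode.Nd;
    auto trie = toTrie!2(digits);
    assert(trie['0'] && trie['7']);
    assert(!trie['a']);
}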
/**
$(P Builds a `Trie` with typically optimal speed-size trade-off
and wraps it into a delegate of the following type:
$(D bool delegate(dchar ch)). )
$(P Effectively this creates a 'tester' lambda suitable
for algorithms like std.algorithm.find that take unary predicates. )
See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
// 3 is very small and is almost as fast as 2-level (due to CPU caches?)
auto t = toTrie!3(set);
return (dchar ch) => t[ch];
}
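// Usage sketch mirroring the Synopsis: the returned delegate is a plain
// unary predicate over dchar and plugs directly into std.algorithm.
@system unittest
{
    import std.algorithm.searching : find;
    auto isNumber = toDelegate(unicode.Nd);
    assert(find!isNumber("abc123") == "123");
}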
/**
$(P Opaque wrapper around unsigned built-in integers and
code unit (char/wchar/dchar) types.
Parameter `sz` indicates that the value is confined
to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
packed more tightly when stored in certain
data-structures like trie. )
Note:
$(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
but not vice versa. Users have to ensure the value fits in
the range required and use the `cast`
operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
enum bitSize = sz;
T _value;
alias _value this;
}
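// A minimal sketch of the wrapper's behaviour: reads convert implicitly to
// the underlying type, while the declared bit size is what packing relies on.
@safe pure nothrow @nogc unittest
{
    auto two = BitPacked!(uint, 2)(3);
    uint plain = two; // implicit conversion via alias this
    assert(plain == 3);
    static assert(bitSizeOf!(typeof(two)) == 2);
}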
/*
Depending on the form of the passed argument, `bitSizeOf` returns
the number of bits required to represent a given type
or the return type of a given functor.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
import std.traits : ReturnType;
alias T = Args[0];
static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
{
enum bitSizeOf = T.bitSize;
}
else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
{
enum bitSizeOf = bitSizeOf!(ReturnType!T);
}
else
{
enum bitSizeOf = T.sizeof*8;
}
}
/**
Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
and thus suitable for packing.
*/
template isBitPacked(T)
{
static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
enum isBitPacked = true;
else
enum isBitPacked = false;
}
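// Quick compile-time check of the trait, for illustration.
@safe pure nothrow @nogc unittest
{
    static assert(isBitPacked!(BitPacked!(uint, 4)));
    static assert(!isBitPacked!uint);
}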
/**
Gives the type `U` from $(LREF BitPacked)!(U, x)
or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
alias TypeOfBitPacked = U;
else
alias TypeOfBitPacked = T;
}
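// Quick compile-time check of the unwrapping behaviour, for illustration.
@safe pure nothrow @nogc unittest
{
    static assert(is(TypeOfBitPacked!(BitPacked!(ushort, 9)) == ushort));
    static assert(is(TypeOfBitPacked!int == int));
}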
/*
Wrapper, used in definition of custom data structures from `Trie` template.
Applying it to a unary lambda function indicates that the returned value always
fits within `bits` bits.
*/
struct assumeSize(alias Fn, size_t bits)
{
enum bitSize = bits;
static auto ref opCall(T)(auto ref T arg)
{
return Fn(arg);
}
}
/*
A helper for defining a lambda function that yields a slice
of certain bits from an unsigned integral value.
The resulting lambda is wrapped in assumeSize and can be used directly
with `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
//for now bypass assumeSize, DMD has trouble inlining it
enum bitSize = to-from;
static auto opCall(T)(T x)
out(result)
{
assert(result < (1 << to-from));
}
do
{
static assert(from < to);
static if (from == 0)
return x & ((1 << to)-1);
else
return (x >> from) & ((1<<(to-from))-1);
}
}
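// Sketch: sliceBits!(from, to) extracts bits [from .. to) of its argument.
@safe pure nothrow @nogc unittest
{
    alias midBits = sliceBits!(4, 7);
    assert(midBits(0b0111_0000) == 0b111);
    assert(midBits(0b1000_1111) == 0b000);
}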
@safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; }
@safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; }
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);
@safe pure nothrow @nogc unittest
{
static assert(bitSizeOf!lo8 == 8);
static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
}
template Sequence(size_t start, size_t end)
{
static if (start < end)
alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
else
alias Sequence = AliasSeq!();
}
//---- TRIE TESTS ----
@system unittest
{
import std.algorithm.iteration : map;
import std.algorithm.sorting : sort;
import std.array : array;
import std.conv : text, to;
import std.range : iota;
static trieStats(TRIE)(TRIE t)
{
version (std_uni_stats)
{
import std.stdio : writefln, writeln;
writeln("---TRIE FOOTPRINT STATS---");
static foreach (i; 0 .. t.table.dim)
{
writefln("lvl%s = %s bytes; %s pages"
, i, t.bytes!i, t.pages!i);
}
writefln("TOTAL: %s bytes", t.bytes);
version (none)
{
writeln("INDEX (excluding value level):");
static foreach (i; 0 .. t.table.dim-1)
writeln(t.table.slice!(i)[0 .. t.table.length!i]);
}
writeln("---------------------------");
}
}
//@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
// alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; });
// alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
alias Set = CodepointSet;
auto set = Set('A','Z','a','z');
auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
for (int a='a'; a<'z';a++)
assert(trie[a]);
for (int a='A'; a<'Z';a++)
assert(trie[a]);
for (int a=0; a<'A'; a++)
assert(!trie[a]);
for (int a ='Z'; a<'a'; a++)
assert(!trie[a]);
trieStats(trie);
auto redundant2 = Set(
1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
trieStats(trie2);
foreach (e; redundant2.byCodepoint)
assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
foreach (i; 0 .. 1024)
{
assert(trie2[i] == (i in redundant2));
}
auto redundant3 = Set(
2, 4, 6, 8, 16,
2+16, 4+16, 16+6, 16+8, 16+16,
2+32, 4+32, 32+6, 32+8,
);
enum max3 = 256;
// sliceBits
auto trie3 = buildTrie!(bool, uint, max3,
sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
)(redundant3.byInterval);
trieStats(trie3);
foreach (i; 0 .. max3)
assert(trie3[i] == (i in redundant3), text(cast(uint) i));
auto redundant4 = Set(
10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
1000, 2000, 3000, 4000, 5000, 6000
);
enum max4 = 2^^16;
auto trie4 = buildTrie!(bool, size_t, max4,
sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
)(redundant4.byInterval);
foreach (i; 0 .. max4)
{
if (i in redundant4)
assert(trie4[i], text(cast(uint) i));
}
trieStats(trie4);
alias mapToS = mapTrieIndex!(useItemAt!(0, char));
string[] redundantS = ["tea", "start", "orange"];
redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
// using first char only
assert(redundantS == ["orange", "start", "tea"]);
assert(strie["test"], text(strie["test"]));
assert(!strie["aea"]);
assert(strie["s"]);
// a bit size test
auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
trieStats(bt);
foreach (i; 0 .. 256)
assert(bt[cast(ubyte) i]);
}
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
size_t impl(const scope T[] arr){ return arr[idx]; }
alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
template useLastItem(T)
{
size_t impl(const scope T[] arr){ return arr[$-1]; }
alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
template fullBitSize(Prefix...)
{
static if (Prefix.length > 0)
enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
else
enum fullBitSize = 0;
}
template idxTypes(Key, size_t fullBits, Prefix...)
{
static if (Prefix.length == 1)
{// the last level is value level, so no index once reduced to 1-level
alias idxTypes = AliasSeq!();
}
else
{
// Important note on bit packing
// Each level has to hold enough bits to address the next one
// The bottom level is known to hold the full bit width,
// thus its size in pages is full_bit_width - size_of_last_prefix
// Recurse on this notion
alias idxTypes =
AliasSeq!(
idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
);
}
}
//============================================================================
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
import std.algorithm.comparison : cmp;
import std.algorithm.iteration : map, filter;
import std.ascii : toLower;
static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';}
return cmp(
a.map!toLower.filter!pred,
b.map!toLower.filter!pred);
}
@safe pure unittest
{
assert(!comparePropertyName("foo-bar", "fooBar"));
}
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
return comparePropertyName(a, b) < 0;
}
//============================================================================
// Utilities for compression of Unicode code point sets
//============================================================================
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
// not optimized, as it's usually done only once (and is not a public interface)
if (val < 128)
arr ~= cast(ubyte) val;
else if (val < (1 << 13))
{
arr ~= (0b1_00 << 5) | cast(ubyte)(val >> 8);
arr ~= val & 0xFF;
}
else
{
assert(val < (1 << 21));
arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16);
arr ~= (val >> 8) & 0xFF;
arr ~= val & 0xFF;
}
}
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
import std.exception : enforce;
immutable first = arr[idx++];
if (!(first & 0x80)) // no top bit -> [0 .. 127]
return first;
immutable extra = ((first >> 5) & 1) + 1; // [1, 2]
uint val = (first & 0x1F);
enforce(idx + extra <= arr.length, "bad code point interval encoding");
foreach (j; 0 .. extra)
val = (val << 8) | arr[idx+j];
idx += extra;
return val;
}
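// Round-trip sketch of the variable-length encoding above: values < 128 take
// 1 byte, < 2^^13 take 2 bytes (0b1_00 prefix), < 2^^21 take 3 bytes (0b1_01 prefix).
@safe pure unittest
{
    ubyte[] buf;
    compressTo(100, buf);      // 1 byte
    compressTo(1000, buf);     // 2 bytes
    compressTo(0x10FFFF, buf); // 3 bytes
    assert(buf.length == 6);
    size_t idx = 0;
    assert(decompressFrom(buf, idx) == 100);
    assert(decompressFrom(buf, idx) == 1000);
    assert(decompressFrom(buf, idx) == 0x10FFFF);
    assert(idx == buf.length);
}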
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
ubyte[] storage;
uint base = 0;
// RLE encode
foreach (val; intervals)
{
compressTo(val[0]-base, storage);
base = val[0];
if (val[1] != lastDchar+1) // till the end of the domain so don't store it
{
compressTo(val[1]-base, storage);
base = val[1];
}
}
return storage;
}
@safe pure unittest
{
import std.algorithm.comparison : equal;
import std.typecons : tuple;
auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
assert(compressIntervals(run) == enc);
auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
assert(compressIntervals(run2) == enc2);
size_t idx = 0;
assert(decompressFrom(enc, idx) == 80);
assert(decompressFrom(enc, idx) == 47);
assert(decompressFrom(enc, idx) == 1);
assert(decompressFrom(enc, idx) == (1 << 10));
idx = 0;
assert(decompressFrom(enc2, idx) == 0);
assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
assert(equal(decompressIntervals(compressIntervals(run)), run));
assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
// Creates a range of `CodepointInterval` that lazily decodes compressed data.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
return DecompressedIntervals(data);
}
@safe struct DecompressedIntervals
{
pure:
const(ubyte)[] _stream;
size_t _idx;
CodepointInterval _front;
this(const(ubyte)[] stream)
{
_stream = stream;
popFront();
}
@property CodepointInterval front()
{
assert(!empty);
return _front;
}
void popFront()
{
if (_idx == _stream.length)
{
_idx = size_t.max;
return;
}
uint base = _front[1];
_front[0] = base + decompressFrom(_stream, _idx);
if (_idx == _stream.length)// odd length ---> till the end
_front[1] = lastDchar+1;
else
{
base = _front[0];
_front[1] = base + decompressFrom(_stream, _idx);
}
}
@property bool empty() const
{
return _idx == size_t.max;
}
@property DecompressedIntervals save() return scope { return this; }
}
@safe pure nothrow @nogc unittest
{
static assert(isInputRange!DecompressedIntervals);
static assert(isForwardRange!DecompressedIntervals);
}
//============================================================================
version (std_uni_bootstrap){}
else
{
// helper for looking up code point sets
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
import std.algorithm.iteration : map;
import std.range : assumeSorted;
auto range = assumeSorted!((a,b) => propertyNameLess(a,b))
(table.map!"a.name"());
size_t idx = range.lowerBound(name).length;
if (idx < range.length && comparePropertyName(range[idx], name) == 0)
return idx;
return -1;
}
// another one that loads it
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
auto idx = findUnicodeSet!table(name);
if (idx >= 0)
{
dest = Set(asSet(table[idx].compressed));
return true;
}
return false;
}
bool loadProperty(Set=CodepointSet, C)
(const scope C[] name, ref Set target) pure
{
import std.internal.unicode_tables : uniProps; // generated file
alias ucmp = comparePropertyName;
// conjure cumulative properties by hand
if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
{
target = asSet(uniProps.Lu);
target |= asSet(uniProps.Ll);
target |= asSet(uniProps.Lt);
target |= asSet(uniProps.Lo);
target |= asSet(uniProps.Lm);
}
else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
{
target = asSet(uniProps.Ll);
target |= asSet(uniProps.Lu);
target |= asSet(uniProps.Lt);// Title case
}
else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
{
target = asSet(uniProps.Mn);
target |= asSet(uniProps.Mc);
target |= asSet(uniProps.Me);
}
else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
{
target = asSet(uniProps.Nd);
target |= asSet(uniProps.Nl);
target |= asSet(uniProps.No);
}
else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
{
target = asSet(uniProps.Pc);
target |= asSet(uniProps.Pd);
target |= asSet(uniProps.Ps);
target |= asSet(uniProps.Pe);
target |= asSet(uniProps.Pi);
target |= asSet(uniProps.Pf);
target |= asSet(uniProps.Po);
}
else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
{
target = asSet(uniProps.Sm);
target |= asSet(uniProps.Sc);
target |= asSet(uniProps.Sk);
target |= asSet(uniProps.So);
}
else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
{
target = asSet(uniProps.Zs);
target |= asSet(uniProps.Zl);
target |= asSet(uniProps.Zp);
}
else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
{
target = asSet(uniProps.Co);
target |= asSet(uniProps.Lo);
target |= asSet(uniProps.No);
target |= asSet(uniProps.So);
target |= asSet(uniProps.Po);
}
else if (ucmp(name, "graphical") == 0)
{
target = asSet(uniProps.Alphabetic);
target |= asSet(uniProps.Mn);
target |= asSet(uniProps.Mc);
target |= asSet(uniProps.Me);
target |= asSet(uniProps.Nd);
target |= asSet(uniProps.Nl);
target |= asSet(uniProps.No);
target |= asSet(uniProps.Pc);
target |= asSet(uniProps.Pd);
target |= asSet(uniProps.Ps);
target |= asSet(uniProps.Pe);
target |= asSet(uniProps.Pi);
target |= asSet(uniProps.Pf);
target |= asSet(uniProps.Po);
target |= asSet(uniProps.Zs);
target |= asSet(uniProps.Sm);
target |= asSet(uniProps.Sc);
target |= asSet(uniProps.Sk);
target |= asSet(uniProps.So);
}
else if (ucmp(name, "any") == 0)
target = Set.fromIntervals(0, 0x110000);
else if (ucmp(name, "ascii") == 0)
target = Set.fromIntervals(0, 0x80);
else
return loadUnicodeSet!(uniProps.tab)(name, target);
return true;
}
// CTFE-only helper for checking property names at compile-time
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
import std.algorithm.searching : find;
auto names = [
"L", "Letter",
"LC", "Cased Letter",
"M", "Mark",
"N", "Number",
"P", "Punctuation",
"S", "Symbol",
"Z", "Separator",
"Graphical",
"any",
"ascii"
];
auto x = find!(x => comparePropertyName(x, name) == 0)(names);
return !x.empty;
}
// ditto, CTFE-only, not optimized
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
return findUnicodeSet!table(name) >= 0;
}
template SetSearcher(alias table, string kind)
{
/// Run-time checked search.
static auto opCall(C)(const scope C[] name)
if (is(C : dchar))
{
import std.conv : to;
CodepointSet set;
if (loadUnicodeSet!table(name, set))
return set;
throw new Exception("No unicode set for "~kind~" by name "
~name.to!string()~" was found.");
}
/// Compile-time checked search.
static @property auto opDispatch(string name)()
{
static if (findSetName!table(name))
{
CodepointSet set;
loadUnicodeSet!table(name, set);
return set;
}
else
static assert(false, "No unicode set for "~kind~" by name "
~name~" was found.");
}
}
// Characters that need escaping in string posed as regular expressions
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~');
package(std) CodepointSet memoizeExpr(string expr)()
{
if (__ctfe)
return mixin(expr);
alias T = typeof(mixin(expr));
static T slot;
static bool initialized;
if (!initialized)
{
slot = mixin(expr);
initialized = true;
}
return slot;
}
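// Usage sketch: the mixed-in expression is evaluated once at run time and
// cached in a function-local static; CTFE simply re-evaluates it.
@safe unittest
{
    auto first = memoizeExpr!("unicode.White_Space")();
    auto second = memoizeExpr!("unicode.White_Space")();
    assert(first == second);
    assert(first[' ']);
}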
//property for \w character class
package(std) @property CodepointSet wordCharacter() @safe
{
return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
| unicode.Me | unicode.Nd | unicode.Pc")();
}
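// Sketch of the resulting \w class: letters, marks, digits and connector
// punctuation are in; plain whitespace is not.
@safe unittest
{
    auto w = wordCharacter;
    assert(w['a'] && w['0'] && w['_']);
    assert(!w[' ']);
}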
//basic stack, just in case it gets used anywhere other than the Parser
package(std) struct Stack(T)
{
@safe:
T[] data;
@property bool empty(){ return data.empty; }
@property size_t length(){ return data.length; }
void push(T val){ data ~= val; }
@trusted T pop()
{
assert(!empty);
auto val = data[$ - 1];
data = data[0 .. $ - 1];
if (!__ctfe)
cast(void) data.assumeSafeAppend();
return val;
}
@property ref T top()
{
assert(!empty);
return data[$ - 1];
}
}
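// Minimal sketch of the LIFO helper used by the set parser below.
@safe unittest
{
    Stack!int st;
    assert(st.empty);
    st.push(1);
    st.push(2);
    assert(st.top == 2 && st.length == 2);
    assert(st.pop() == 2);
    assert(st.top == 1);
}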
//test if a given string starts with a hex number of maxDigit digits that is a valid codepoint
//returns its value and skips these maxDigit chars on success, throws on failure
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
import std.exception : enforce;
//std.conv.parse is both @system and bogus
uint val;
for (int k = 0; k < maxDigit; k++)
{
enforce(!str.empty, "incomplete escape sequence");
//accepts ascii only, so it's OK to index directly
immutable current = str.front;
if ('0' <= current && current <= '9')
val = val * 16 + current - '0';
else if ('a' <= current && current <= 'f')
val = val * 16 + current -'a' + 10;
else if ('A' <= current && current <= 'F')
val = val * 16 + current - 'A' + 10;
else
throw new Exception("invalid escape sequence");
str.popFront();
}
enforce(val <= 0x10FFFF, "invalid codepoint");
return val;
}
@safe unittest
{
import std.algorithm.searching : canFind;
import std.exception : collectException;
string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
string[] hex = [ "01", "ff", "00af", "10FFFF" ];
int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
foreach (v; non_hex)
assert(collectException(parseUniHex(v, v.length)).msg
.canFind("invalid escape sequence"));
foreach (i, v; hex)
assert(parseUniHex(v, v.length) == value[i]);
string over = "0011FFFF";
assert(collectException(parseUniHex(over, over.length)).msg
.canFind("invalid codepoint"));
}
auto caseEnclose(CodepointSet set)
{
auto cased = set & unicode.LC;
foreach (dchar ch; cased.byCodepoint)
{
foreach (c; simpleCaseFoldings(ch))
set |= c;
}
return set;
}
/+
fetch codepoint set corresponding to a name (InBlock or binary property)
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe
{
CodepointSet s = unicode(name);
//FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
if (casefold)
s = caseEnclose(s);
if (negated)
s = s.inverted;
return s;
}
struct UnicodeSetParser(Range)
{
import std.exception : enforce;
import std.typecons : tuple, Tuple;
Range range;
bool casefold_;
@property bool empty(){ return range.empty; }
@property dchar front(){ return range.front; }
void popFront(){ range.popFront(); }
//CodepointSet operations relatively in order of priority
enum Operator:uint {
Open = 0, Negate, Difference, SymDifference, Intersection, Union, None
}
//parse unit of CodepointSet spec, most notably escape sequences and char ranges
//also fetches next set operation
Tuple!(CodepointSet,Operator) parseCharTerm()
{
import std.range : drop;
enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
enum State{ Start, Char, Escape, CharDash, CharDashEscape,
PotentialTwinSymbolOperator }
Operator op = Operator.None;
dchar last;
CodepointSet set;
State state = State.Start;
void addWithFlags(ref CodepointSet set, uint ch)
{
if (casefold_)
{
auto range = simpleCaseFoldings(ch);
foreach (v; range)
set |= v;
}
else
set |= ch;
}
static Operator twinSymbolOperator(dchar symbol)
{
switch (symbol)
{
case '|':
return Operator.Union;
case '-':
return Operator.Difference;
case '~':
return Operator.SymDifference;
case '&':
return Operator.Intersection;
default:
assert(false);
}
}
L_CharTermLoop:
for (;;)
{
final switch (state)
{
case State.Start:
switch (front)
{
case '|':
case '-':
case '~':
case '&':
state = State.PotentialTwinSymbolOperator;
last = front;
break;
case '[':
op = Operator.Union;
goto case;
case ']':
break L_CharTermLoop;
case '\\':
state = State.Escape;
break;
default:
state = State.Char;
last = front;
}
break;
case State.Char:
// xxx last front xxx
switch (front)
{
case '|':
case '~':
case '&':
// then last is treated as normal char and added as implicit union
state = State.PotentialTwinSymbolOperator;
addWithFlags(set, last);
last = front;
break;
case '-': // still need more info
state = State.CharDash;
break;
case '\\':
set |= last;
state = State.Escape;
break;
case '[':
op = Operator.Union;
goto case;
case ']':
addWithFlags(set, last);
break L_CharTermLoop;
default:
state = State.Char;
addWithFlags(set, last);
last = front;
}
break;
case State.PotentialTwinSymbolOperator:
// xxx last front xxxx
// where last = [|-&~]
if (front == last)
{
op = twinSymbolOperator(last);
popFront();//skip second twin char
break L_CharTermLoop;
}
goto case State.Char;
case State.Escape:
// xxx \ front xxx
switch (front)
{
case 'f':
last = '\f';
state = State.Char;
break;
case 'n':
last = '\n';
state = State.Char;
break;
case 'r':
last = '\r';
state = State.Char;
break;
case 't':
last = '\t';
state = State.Char;
break;
case 'v':
last = '\v';
state = State.Char;
break;
case 'c':
last = unicode.parseControlCode(this);
state = State.Char;
break;
foreach (val; Escapables)
{
case val:
}
last = front;
state = State.Char;
break;
case 'p':
set.add(unicode.parsePropertySpec(this, false, casefold_));
state = State.Start;
continue L_CharTermLoop; //next char already fetched
case 'P':
set.add(unicode.parsePropertySpec(this, true, casefold_));
state = State.Start;
continue L_CharTermLoop; //next char already fetched
case 'x':
popFront();
last = parseUniHex(this, 2);
state = State.Char;
continue L_CharTermLoop;
case 'u':
popFront();
last = parseUniHex(this, 4);
state = State.Char;
continue L_CharTermLoop;
case 'U':
popFront();
last = parseUniHex(this, 8);
state = State.Char;
continue L_CharTermLoop;
case 'd':
set.add(unicode.Nd);
state = State.Start;
break;
case 'D':
set.add(unicode.Nd.inverted);
state = State.Start;
break;
case 's':
set.add(unicode.White_Space);
state = State.Start;
break;
case 'S':
set.add(unicode.White_Space.inverted);
state = State.Start;
break;
case 'w':
set.add(wordCharacter);
state = State.Start;
break;
case 'W':
set.add(wordCharacter.inverted);
state = State.Start;
break;
default:
if (front >= privateUseStart && front <= privateUseEnd)
enforce(false, "no matching ']' found while parsing character class");
enforce(false, "invalid escape sequence");
}
break;
case State.CharDash:
// xxx last - front xxx
switch (front)
{
case '[':
op = Operator.Union;
goto case;
case ']':
//means dash is a single char not an interval specifier
addWithFlags(set, last);
addWithFlags(set, '-');
break L_CharTermLoop;
case '-'://set Difference again
addWithFlags(set, last);
op = Operator.Difference;
popFront();//skip '-'
break L_CharTermLoop;
case '\\':
state = State.CharDashEscape;
break;
default:
enforce(last <= front, "inverted range");
if (casefold_)
{
for (uint ch = last; ch <= front; ch++)
addWithFlags(set, ch);
}
else
set.add(last, front + 1);
state = State.Start;
}
break;
case State.CharDashEscape:
//xxx last - \ front xxx
uint end;
switch (front)
{
case 'f':
end = '\f';
break;
case 'n':
end = '\n';
break;
case 'r':
end = '\r';
break;
case 't':
end = '\t';
break;
case 'v':
end = '\v';
break;
foreach (val; Escapables)
{
case val:
}
end = front;
break;
case 'c':
end = unicode.parseControlCode(this);
break;
case 'x':
popFront();
end = parseUniHex(this, 2);
enforce(last <= end,"inverted range");
set.add(last, end + 1);
state = State.Start;
continue L_CharTermLoop;
case 'u':
popFront();
end = parseUniHex(this, 4);
enforce(last <= end,"inverted range");
set.add(last, end + 1);
state = State.Start;
continue L_CharTermLoop;
case 'U':
popFront();
end = parseUniHex(this, 8);
enforce(last <= end,"inverted range");
set.add(last, end + 1);
state = State.Start;
continue L_CharTermLoop;
default:
if (front >= privateUseStart && front <= privateUseEnd)
enforce(false, "no matching ']' found while parsing character class");
enforce(false, "invalid escape sequence");
}
// Lookahead to check if it's a \T
// where T is sub-pattern terminator in multi-pattern scheme
auto lookahead = range.save.drop(1);
if (end == '\\' && !lookahead.empty)
{
if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
enforce(false, "no matching ']' found while parsing character class");
}
enforce(last <= end,"inverted range");
set.add(last, end + 1);
state = State.Start;
break;
}
popFront();
enforce(!empty, "unexpected end of CodepointSet");
}
return tuple(set, op);
}
alias ValStack = Stack!(CodepointSet);
alias OpStack = Stack!(Operator);
CodepointSet parseSet()
{
ValStack vstack;
OpStack opstack;
import std.functional : unaryFun;
enforce(!empty, "unexpected end of input");
enforce(front == '[', "expected '[' at the start of unicode set");
//
static bool apply(Operator op, ref ValStack stack)
{
switch (op)
{
case Operator.Negate:
enforce(!stack.empty, "no operand for '^'");
stack.top = stack.top.inverted;
break;
case Operator.Union:
auto s = stack.pop();//2nd operand
enforce(!stack.empty, "no operand for '||'");
stack.top.add(s);
break;
case Operator.Difference:
auto s = stack.pop();//2nd operand
enforce(!stack.empty, "no operand for '--'");
stack.top.sub(s);
break;
case Operator.SymDifference:
auto s = stack.pop();//2nd operand
enforce(!stack.empty, "no operand for '~~'");
stack.top ~= s;
break;
case Operator.Intersection:
auto s = stack.pop();//2nd operand
enforce(!stack.empty, "no operand for '&&'");
stack.top.intersect(s);
break;
default:
return false;
}
return true;
}
static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
{
while (cond(opstack.top))
{
if (!apply(opstack.pop(),vstack))
return false;//syntax error
if (opstack.empty)
return false;
}
return true;
}
L_CharsetLoop:
do
{
switch (front)
{
case '[':
opstack.push(Operator.Open);
popFront();
enforce(!empty, "unexpected end of character class");
if (front == '^')
{
opstack.push(Operator.Negate);
popFront();
enforce(!empty, "unexpected end of character class");
}
else if (front == ']') // []...] is special cased
{
popFront();
enforce(!empty, "wrong character set");
auto pair = parseCharTerm();
pair[0].add(']', ']'+1);
if (pair[1] != Operator.None)
{
if (opstack.top == Operator.Union)
unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
opstack.push(pair[1]);
}
vstack.push(pair[0]);
}
break;
case ']':
enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
"character class syntax error");
enforce(!opstack.empty, "unmatched ']'");
opstack.pop();
popFront();
if (opstack.empty)
break L_CharsetLoop;
auto pair = parseCharTerm();
if (!pair[0].empty)//not only operator e.g. -- or ~~
{
vstack.top.add(pair[0]);//apply union
}
if (pair[1] != Operator.None)
{
if (opstack.top == Operator.Union)
unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
opstack.push(pair[1]);
}
break;
//
default://yet another pair of term(op)?
auto pair = parseCharTerm();
if (pair[1] != Operator.None)
{
if (opstack.top == Operator.Union)
unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
opstack.push(pair[1]);
}
vstack.push(pair[0]);
}
}while (!empty || !opstack.empty);
while (!opstack.empty)
apply(opstack.pop(),vstack);
assert(vstack.length == 1);
return vstack.top;
}
}
/**
A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
a block, script or general category.
It uses well defined standard rules of property name lookup.
This includes fuzzy matching of names, so that
'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
import std.exception : enforce;
/**
Performs the lookup of set of $(CODEPOINTS)
with compile-time correctness checking.
This short-cut version combines 3 searches:
across blocks, scripts, and common binary properties.
Note that since scripts and blocks overlap the
usual trick to disambiguate is used - to get a block use
`unicode.InBlockName`, to search a script
use `unicode.ScriptName`.
See_Also: $(LREF block), $(LREF script)
and (not included in this search) $(LREF hangulSyllableType).
*/
static @property auto opDispatch(string name)() pure
{
static if (findAny(name))
return loadAny(name);
else
static assert(false, "No unicode set by name "~name~" was found.");
}
///
@safe unittest
{
import std.exception : collectException;
auto ascii = unicode.ASCII;
assert(ascii['A']);
assert(ascii['~']);
assert(!ascii['\u00e0']);
// matching is case-insensitive
assert(ascii == unicode.ascII);
assert(!ascii['à']);
// underscores, '-' and whitespace in names are ignored too
auto latin = unicode.in_latin1_Supplement;
assert(latin['à']);
assert(!latin['$']);
// BTW Latin 1 Supplement is a block, hence "In" prefix
assert(latin == unicode("In Latin 1 Supplement"));
// run-time look up throws if no such set is found
assert(collectException(unicode("InCyrilliac")));
}
/**
The same lookup across blocks, scripts, or binary properties,
but performed at run-time.
This version is provided for cases where `name`
is not known beforehand; otherwise compile-time
checked $(LREF opDispatch) is typically a better choice.
See the $(S_LINK Unicode properties, table of properties) for available
sets.
*/
static auto opCall(C)(const scope C[] name)
if (is(C : dchar))
{
return loadAny(name);
}
/**
Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
Note:
Here block names are unambiguous as no scripts are searched
and thus to search, simply use the `unicode.block.BlockName` notation.
See $(S_LINK Unicode properties, table of properties) for available sets.
See_Also: $(S_LINK Unicode properties, table of properties).
*/
struct block
{
import std.internal.unicode_tables : blocks; // generated file
mixin SetSearcher!(blocks.tab, "block");
}
///
@safe unittest
{
// use .block for explicitness
assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
}
/**
Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.
See the $(S_LINK Unicode properties, table of properties) for available
sets.
*/
struct script
{
import std.internal.unicode_tables : scripts; // generated file
mixin SetSearcher!(scripts.tab, "script");
}
///
@safe unittest
{
auto arabicScript = unicode.script.arabic;
auto arabicBlock = unicode.block.arabic;
// there is an intersection between script and block
assert(arabicBlock['\u0601']);
assert(arabicScript['\u0601']);
// but they are different
assert(arabicBlock != arabicScript);
assert(arabicBlock == unicode.inArabic);
assert(arabicScript == unicode.arabic);
}
/**
Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.
Other non-binary properties (once supported) follow the same
notation - `unicode.propertyName.propertyValue` for compile-time
checked access and `unicode.propertyName(propertyValue)`
for run-time checked one.
See the $(S_LINK Unicode properties, table of properties) for available
sets.
*/
struct hangulSyllableType
{
import std.internal.unicode_tables : hangul; // generated file
mixin SetSearcher!(hangul.tab, "hangul syllable type");
}
///
@safe unittest
{
// L here is syllable type not Letter as in unicode.L short-cut
auto leadingVowel = unicode.hangulSyllableType("L");
// check that some leading vowels are present
foreach (vowel; '\u1110'..'\u115F')
assert(leadingVowel[vowel]);
assert(leadingVowel == unicode.hangulSyllableType.L);
}
//parse control code of form \cXXX, c assumed to be the current symbol
static package(std) dchar parseControlCode(Parser)(ref Parser p)
{
with(p)
{
popFront();
enforce(!empty, "Unfinished escape sequence");
enforce(('a' <= front && front <= 'z')
|| ('A' <= front && front <= 'Z'),
"Only letters are allowed after \\c");
return front & 0x1f;
}
}
//parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
//\ - assumed to be processed, p - is current
static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
bool negated, bool casefold)
{
static import std.ascii;
with(p)
{
enum MAX_PROPERTY = 128;
char[MAX_PROPERTY] result;
uint k = 0;
popFront();
enforce(!empty, "eof parsing unicode property spec");
if (front == '{')
{
popFront();
while (k < MAX_PROPERTY && !empty && front !='}'
&& front !=':')
{
if (front != '-' && front != ' ' && front != '_')
result[k++] = cast(char) std.ascii.toLower(front);
popFront();
}
enforce(k != MAX_PROPERTY, "invalid property name");
enforce(front == '}', "} expected ");
}
else
{//single char properties e.g.: \pL, \pN ...
enforce(front < 0x80, "invalid property name");
result[k++] = cast(char) front;
}
auto s = getUnicodeSet(result[0 .. k], negated, casefold);
enforce(!s.empty, "unrecognized unicode property spec");
popFront();
return s;
}
}
/**
Parse unicode codepoint set from given `range` using standard regex
syntax '[...]'. The range is advanced, skipping over the regex set definition.
The `casefold` parameter determines if the set should be casefolded - that is,
include both lower and upper case versions of any letters in the set.
*/
static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
if (isInputRange!Range && is(ElementType!Range : dchar))
{
auto usParser = UnicodeSetParser!Range(range, casefold);
auto set = usParser.parseSet();
range = usParser.range;
return set;
}
///
@safe unittest
{
import std.uni : unicode;
string pat = "[a-zA-Z0-9]hello";
auto set = unicode.parseSet(pat);
// check some of the codepoints
assert(set['a'] && set['A'] && set['9']);
assert(pat == "hello");
}
private:
alias ucmp = comparePropertyName;
static bool findAny(string name)
{
import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
return isPrettyPropertyName(name)
|| findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
|| (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
}
static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
{
import std.conv : to;
import std.internal.unicode_tables : blocks, scripts; // generated file
Set set;
immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
|| (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
&& loadUnicodeSet!(blocks.tab)(name[2..$], set));
if (loaded)
return set;
throw new Exception("No unicode set by name "~name.to!string()~" was found.");
}
// FIXME: re-disable once the compiler is fixed
// Disabled to prevent the mistake of creating instances of this pseudo-struct.
//@disable ~this();
}
@safe unittest
{
import std.internal.unicode_tables : blocks, uniProps; // generated file
assert(unicode("InHebrew") == asSet(blocks.Hebrew));
assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
// control codes, except '\r'
enum controlSwitch = `
case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
// TODO: redo most of the hangul stuff algorithmically in the case of Graphemes too
// kill unrolled switches
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
}
template genericDecodeGrapheme(bool getValue)
{
alias graphemeExtend = graphemeExtendTrie;
alias spacingMark = mcTrie;
static if (getValue)
alias Value = Grapheme;
else
alias Value = void;
Value genericDecodeGrapheme(Input)(ref Input range)
{
import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
enum GraphemeState {
Start,
CR,
RI,
L,
V,
LVT
}
static if (getValue)
Grapheme grapheme;
auto state = GraphemeState.Start;
enum eat = q{
static if (getValue)
grapheme ~= ch;
range.popFront();
};
dchar ch;
assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
while (!range.empty)
{
ch = range.front;
final switch (state) with(GraphemeState)
{
case Start:
mixin(eat);
if (ch == '\r')
state = CR;
else if (isRegionalIndicator(ch))
state = RI;
else if (isHangL(ch))
state = L;
else if (hangLV[ch] || isHangV(ch))
state = V;
else if (hangLVT[ch])
state = LVT;
else if (isHangT(ch))
state = LVT;
else
{
switch (ch)
{
mixin(controlSwitch);
goto L_End;
default:
goto L_End_Extend;
}
}
break;
case CR:
if (ch == '\n')
mixin(eat);
goto L_End_Extend;
case RI:
if (isRegionalIndicator(ch))
mixin(eat);
else
goto L_End_Extend;
break;
case L:
if (isHangL(ch))
mixin(eat);
else if (isHangV(ch) || hangLV[ch])
{
state = V;
mixin(eat);
}
else if (hangLVT[ch])
{
state = LVT;
mixin(eat);
}
else
goto L_End_Extend;
break;
case V:
if (isHangV(ch))
mixin(eat);
else if (isHangT(ch))
{
state = LVT;
mixin(eat);
}
else
goto L_End_Extend;
break;
case LVT:
if (isHangT(ch))
{
mixin(eat);
}
else
goto L_End_Extend;
break;
}
}
L_End_Extend:
while (!range.empty)
{
ch = range.front;
// extend & spacing marks
if (!graphemeExtend[ch] && !spacingMark[ch])
break;
mixin(eat);
}
L_End:
static if (getValue)
return grapheme;
}
}
public: // Public API continues
/++
Computes the length of grapheme cluster starting at `index`.
Both the resulting length and the `index` are measured
in $(S_LINK Code unit, code units).
Params:
C = type that is implicitly convertible to `dchar`
input = array of grapheme clusters
index = starting index into `input[]`
Returns:
length of grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
auto src = input[index..$];
auto n = src.length;
genericDecodeGrapheme!(false)(src);
return n - src.length;
}
///
@safe unittest
{
assert(graphemeStride(" ", 1) == 1);
// A + combining ring above
string city = "A\u030Arhus";
size_t first = graphemeStride(city, 0);
assert(first == 3); //\u030A has 2 UTF-8 code units
assert(city[0 .. first] == "A\u030A");
assert(city[first..$] == "rhus");
}
@safe unittest
{
// Ensure that graphemeStride is usable from CTFE.
enum c1 = graphemeStride("A", 0);
static assert(c1 == 1);
enum c2 = graphemeStride("A\u0301", 0);
static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}
/++
Reads one full grapheme cluster from an
$(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.
For examples see the $(LREF Grapheme) below.
Note:
This function modifies `inp` and thus `inp`
must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
return genericDecodeGrapheme!true(inp);
}
@safe unittest
{
import std.algorithm.comparison : equal;
Grapheme gr;
string s = " \u0020\u0308 ";
gr = decodeGrapheme(s);
assert(gr.length == 1 && gr[0] == ' ');
gr = decodeGrapheme(s);
assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
s = "\u0300\u0308\u1100";
assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
assert(equal(decodeGrapheme(s)[], "\u1100"));
s = "\u11A8\u0308\uAC01";
assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
assert(equal(decodeGrapheme(s)[], "\uAC01"));
}
/++
$(P Iterate a string by $(LREF Grapheme).)
$(P Useful for doing string manipulation that needs to be aware
of graphemes.)
See_Also:
$(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
// TODO: Bidirectional access
static struct Result(R)
{
private R _range;
private Grapheme _front;
bool empty() @property
{
return _front.length == 0;
}
Grapheme front() @property
{
return _front;
}
void popFront()
{
_front = _range.empty ? Grapheme.init : _range.decodeGrapheme();
}
static if (isForwardRange!R)
{
Result save() @property
{
return Result(_range.save, _front);
}
}
}
auto result = Result!(Range)(range);
result.popFront();
return result;
}
///
@safe unittest
{
import std.algorithm.comparison : equal;
import std.range.primitives : walkLength;
import std.range : take, drop;
auto text = "noe\u0308l"; // noël using e + combining diaeresis
assert(text.walkLength == 5); // 5 code points
auto gText = text.byGrapheme;
assert(gText.walkLength == 4); // 4 graphemes
assert(gText.take(3).equal("noe\u0308".byGrapheme));
assert(gText.drop(3).equal("l".byGrapheme));
}
// For testing non-forward-range input ranges
version (StdUnittest)
private static @safe struct InputRangeString
{
private string s;
bool empty() @property { return s.empty; }
dchar front() @property { return s.front; }
void popFront() { s.popFront(); }
}
@safe unittest
{
import std.algorithm.comparison : equal;
import std.array : array;
import std.range : retro;
import std.range.primitives : walkLength;
assert("".byGrapheme.walkLength == 0);
auto reverse = "le\u0308on";
assert(reverse.walkLength == 5);
auto gReverse = reverse.byGrapheme;
assert(gReverse.walkLength == 4);
static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
{{
assert(text.walkLength == 5);
static assert(isForwardRange!(typeof(text)));
auto gText = text.byGrapheme;
static assert(isForwardRange!(typeof(gText)));
assert(gText.walkLength == 4);
assert(gText.array.retro.equal(gReverse));
}}
auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
static assert(!isForwardRange!(typeof(nonForwardRange)));
assert(nonForwardRange.walkLength == 4);
}
/++
$(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)
$(P Useful for converting the result to a string after doing operations
on graphemes.)
$(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
// TODO: Propagate bidirectional access
static struct Result
{
private Range _range;
private size_t i = 0;
bool empty() @property
{
return _range.empty;
}
dchar front() @property
{
return _range.front[i];
}
void popFront()
{
++i;
if (i >= _range.front.length)
{
_range.popFront();
i = 0;
}
}
static if (isForwardRange!Range)
{
Result save() @property
{
return Result(_range.save, i);
}
}
}
return Result(range);
}
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
import std.range.primitives : isBidirectionalRange, popBack;
import std.traits : isNarrowString;
static if (isNarrowString!Range)
{
static struct Result
{
private Range _range;
@property bool empty() { return _range.empty; }
@property dchar front(){ return _range.front; }
void popFront(){ _range.popFront; }
@property auto save() { return Result(_range.save); }
@property dchar back(){ return _range.back; }
void popBack(){ _range.popBack; }
}
static assert(isBidirectionalRange!(Result));
return Result(range);
}
else
return range;
}
///
@safe unittest
{
import std.array : array;
import std.conv : text;
import std.range : retro;
string s = "noe\u0308l"; // noël
// reverse it and convert the result to a string
string reverse = s.byGrapheme
.array
.retro
.byCodePoint
.text;
assert(reverse == "le\u0308on"); // lëon
}
@safe unittest
{
import std.algorithm.comparison : equal;
import std.range.primitives : walkLength;
import std.range : retro;
assert("".byGrapheme.byCodePoint.equal(""));
string text = "noe\u0308l";
static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));
auto gText = InputRangeString(text).byGrapheme;
static assert(!isForwardRange!(typeof(gText)));
auto cpText = gText.byCodePoint;
static assert(!isForwardRange!(typeof(cpText)));
assert(cpText.walkLength == text.walkLength);
auto plainCp = text.byCodePoint;
static assert(isForwardRange!(typeof(plainCp)));
assert(equal(plainCp, text));
assert(equal(retro(plainCp.save), retro(text.save)));
// Check that we still have length for dstring
assert("абвгд"d.byCodePoint.length == 5);
}
/++
$(P A structure designed to effectively pack $(CHARACTERS)
of a $(CLUSTER).
)
$(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
always refer to distinct objects. In most actual scenarios a `Grapheme`
fits on the stack and avoids memory allocation overhead for all but quite
long clusters.
)
See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
import std.exception : enforce;
import std.traits : isDynamicArray;
public:
/// Ctor
this(C)(const scope C[] chars...)
if (is(C : dchar))
{
this ~= chars;
}
///ditto
this(Input)(Input seq)
if (!isDynamicArray!Input
&& isInputRange!Input && is(ElementType!Input : dchar))
{
this ~= seq;
}
/// Gets a $(CODEPOINT) at the given index in this cluster.
dchar opIndex(size_t index) const @nogc nothrow pure @trusted
{
assert(index < length);
return read24(isBig ? ptr_ : small_.ptr, index);
}
/++
Writes a $(CODEPOINT) `ch` at given index in this cluster.
Warning:
Use of this facility may invalidate grapheme cluster,
see also $(LREF Grapheme.valid).
+/
void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
{
assert(index < length);
write24(isBig ? ptr_ : small_.ptr, ch, index);
}
///
@safe unittest
{
auto g = Grapheme("A\u0302");
assert(g[0] == 'A');
assert(g.valid);
g[1] = '~'; // ASCII tilde is not a combining mark
assert(g[1] == '~');
assert(!g.valid);
}
/++
Random-access range over Grapheme's $(CHARACTERS).
Warning: The slice is invalidated when this Grapheme leaves the scope;
attempting to use it after that would lead to memory corruption.
+/
SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
{
return sliceOverIndexed(a, b, &this);
}
/// ditto
SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
{
return sliceOverIndexed(0, length, &this);
}
/// Grapheme cluster length in $(CODEPOINTS).
@property size_t length() const @nogc nothrow pure
{
return isBig ? len_ : slen_ & 0x7F;
}
/++
Append $(CHARACTER) `ch` to this grapheme.
Warning:
Use of this facility may invalidate grapheme cluster,
see also `valid`.
See_Also: $(LREF Grapheme.valid)
+/
ref opOpAssign(string op)(dchar ch) @trusted
{
static if (op == "~")
{
import std.internal.memory : enforceRealloc;
if (!isBig)
{
if (slen_ == small_cap)
convertToBig();// & fallthrough to "big" branch
else
{
write24(small_.ptr, ch, smallLength);
slen_++;
return this;
}
}
assert(isBig);
if (len_ == cap_)
{
import core.checkedint : addu, mulu;
bool overflow;
cap_ = addu(cap_, grow, overflow);
auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
if (overflow) assert(0);
ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
}
write24(ptr_, ch, len_++);
return this;
}
else
static assert(false, "No operation "~op~" defined for Grapheme");
}
///
@safe unittest
{
import std.algorithm.comparison : equal;
auto g = Grapheme("A");
assert(g.valid);
g ~= '\u0301';
assert(g[].equal("A\u0301"));
assert(g.valid);
g ~= "B";
// not a valid grapheme cluster anymore
assert(!g.valid);
// still could be useful though
assert(g[].equal("A\u0301B"));
}
/// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
ref opOpAssign(string op, Input)(scope Input inp)
if (isInputRange!Input && is(ElementType!Input : dchar))
{
static if (op == "~")
{
foreach (dchar ch; inp)
this ~= ch;
return this;
}
else
static assert(false, "No operation "~op~" defined for Grapheme");
}
/++
True if this object contains valid extended grapheme cluster.
Decoding primitives of this module always return a valid `Grapheme`.
Appending to and direct manipulation of grapheme's $(CHARACTERS) may
render it no longer valid. Certain applications may choose to use
Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
entirely.
+/
@property bool valid()() /*const*/
{
auto r = this[];
genericDecodeGrapheme!false(r);
return r.length == 0;
}
this(this) @nogc nothrow pure @trusted
{
import std.internal.memory : enforceMalloc;
if (isBig)
{// dup it
import core.checkedint : addu, mulu;
bool overflow;
auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
if (overflow) assert(0);
auto p = cast(ubyte*) enforceMalloc(raw_cap);
p[0 .. raw_cap] = ptr_[0 .. raw_cap];
ptr_ = p;
}
}
~this() @nogc nothrow pure @trusted
{
import core.memory : pureFree;
if (isBig)
{
pureFree(ptr_);
}
}
private:
enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
// "out of the blue" grow rate, needs testing
// (though graphemes are typically small < 9)
enum grow = 20;
enum small_cap = small_bytes/3;
enum small_flag = 0x80, small_mask = 0x7F;
// 16 bytes in 32bits, should be enough for the majority of cases
union
{
struct
{
ubyte* ptr_;
size_t cap_;
size_t len_;
size_t padding_;
}
struct
{
ubyte[small_bytes] small_;
ubyte slen_;
}
}
void convertToBig() @nogc nothrow pure @trusted
{
import std.internal.memory : enforceMalloc;
static assert(grow.max / 3 - 1 >= grow);
enum nbytes = 3 * (grow + 1);
size_t k = smallLength;
ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
for (int i=0; i