unicode

This module provides support to handle the Unicode UTF-8 encoding.

There are no specialized insert, delete, add and contains procedures for seq[Rune] in this module because the generic variants of these procedures in the system module already work with it.

Types

Rune = distinct RuneImpl
Unicode code point. Can hold any Unicode character.   Source Edit
Rune16 = distinct int16
16 bit Unicode character   Source Edit

Procs

proc `<=%`(a, b: Rune): bool {...}{.raises: [], tags: [].}
  Source Edit
proc `<%`(a, b: Rune): bool {...}{.raises: [], tags: [].}
  Source Edit
proc `==`(a, b: Rune): bool {...}{.raises: [], tags: [].}
  Source Edit
proc runeLen(s: string): int {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
Returns the number of runes of the string s.

Examples:

let a = "añyóng"
doAssert a.runeLen == 6
## note: a.len == 8
  Source Edit
proc runeLenAt(s: string; i: Natural): int {...}{.raises: [], tags: [].}
Returns the number of bytes the rune starting at s[i] takes.

Examples:

let a = "añyóng"
doAssert a.runeLenAt(0) == 1
doAssert a.runeLenAt(1) == 2
  Source Edit
proc validateUtf8(s: string): int {...}{.raises: [], tags: [].}
Returns the position of the invalid byte in s if the string s does not hold valid UTF-8 data. Otherwise -1 is returned.   Source Edit
proc runeAt(s: string; i: Natural): Rune {...}{.raises: [], tags: [].}
Returns the rune in s at byte index i.

Examples:

let a = "añyóng"
doAssert a.runeAt(1) == "ñ".runeAt(0)
doAssert a.runeAt(2) == "ñ".runeAt(1)
doAssert a.runeAt(3) == "y".runeAt(0)
  Source Edit
proc toUTF8(c: Rune): string {...}{.gcsafe, extern: "nuc$1", raises: [], tags: [].}
Converts a rune into its UTF-8 representation.

Examples:

let a = "añyóng"
doAssert a.runeAt(1).toUTF8 == "ñ"
  Source Edit
proc add(s: var string; c: Rune) {...}{.raises: [], tags: [].}
Adds a rune c to a string s.

Examples:

var s = "abc"
let c = "ä".runeAt(0)
s.add(c)
doAssert s == "abcä"
  Source Edit
proc `$`(rune: Rune): string {...}{.raises: [], tags: [].}
An alias for toUTF8.   Source Edit
proc `$`(runes: seq[Rune]): string {...}{.raises: [], tags: [].}
Converts a sequence of Runes to a string.   Source Edit
proc runeOffset(s: string; pos: Natural; start: Natural = 0): int {...}{.raises: [], tags: [].}

Returns the byte position of the rune at position pos in s with an optional start byte position. Returns the special value -1 if it runs out of the string.

Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source Edit
proc runeAtPos(s: string; pos: int): Rune {...}{.raises: [], tags: [].}

Returns the rune at position pos.

Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source Edit
proc runeStrAtPos(s: string; pos: Natural): string {...}{.raises: [], tags: [].}

Returns the rune at position pos as a UTF-8 string.

Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source Edit
proc runeReverseOffset(s: string; rev: Positive): (int, int) {...}{.raises: [], tags: [].}

Returns a tuple with the byte offset of the rune at position rev in s, counting from the end (starting with 1), and the total number of runes in the string. Returns a negative value for offset if there are too few runes in the string to satisfy the request.

Beware: This can lead to unoptimized code and slow execution! Most problems can be solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source Edit
proc runeSubStr(s: string; pos: int; len: int = int.high): string {...}{.raises: [], tags: [].}
Returns the UTF-8 substring starting at codepoint pos with len codepoints. If pos or len is negative they count from the end of the string. If len is not given it means the longest possible string.

Examples:

let s = "Hänsel  ««: 10,00€"
doAssert(runeSubStr(s, 0, 2) == "Hä")
doAssert(runeSubStr(s, 10, 1) == ":")
doAssert(runeSubStr(s, -6) == "10,00€")
doAssert(runeSubStr(s, 10) == ": 10,00€")
doAssert(runeSubStr(s, 12, 5) == "10,00")
doAssert(runeSubStr(s, -6, 3) == "10,")
  Source Edit
proc toLower(c: Rune): Rune {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c into lower case. This works for any rune. If possible, prefer toLower over toUpper.   Source Edit
proc toUpper(c: Rune): Rune {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c into upper case. This works for any rune. If possible, prefer toLower over toUpper.   Source Edit
proc toTitle(c: Rune): Rune {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c to title case.   Source Edit
proc isLower(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a lower case rune. If possible, prefer isLower over isUpper.   Source Edit
proc isUpper(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is an upper case rune. If possible, prefer isLower over isUpper.   Source Edit
proc isAlpha(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is an alpha rune (i.e., a letter)   Source Edit
proc isTitle(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode titlecase character.   Source Edit
proc isWhiteSpace(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode whitespace character.   Source Edit
proc isCombining(c: Rune): bool {...}{.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode combining character.   Source Edit
proc isAlpha(s: string): bool {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                            raises: [], tags: [].}
Returns true iff s contains all alphabetic runes.   Source Edit
proc isSpace(s: string): bool {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                            raises: [], tags: [].}
Returns true iff s contains all whitespace runes.   Source Edit
proc isLower(s: string; skipNonAlpha: bool): bool {...}{.deprecated: "Deprecated since version 0.20 since its semantics are unclear",
    raises: [], tags: [].}

Checks whether s is lower case.

If skipNonAlpha is true, returns true if all alphabetical runes in s are lower case. Returns false if none of the runes in s are alphabetical.

If skipNonAlpha is false, returns true only if all runes in s are alphabetical and lower case.

For either value of skipNonAlpha, returns false if s is an empty string.

  Source Edit
proc isUpper(s: string; skipNonAlpha: bool): bool {...}{.deprecated: "Deprecated since version 0.20 since its semantics are unclear",
    raises: [], tags: [].}

Checks whether s is upper case.

If skipNonAlpha is true, returns true if all alphabetical runes in s are upper case. Returns false if none of the runes in s are alphabetical.

If skipNonAlpha is false, returns true only if all runes in s are alphabetical and upper case.

For either value of skipNonAlpha, returns false if s is an empty string.

  Source Edit
proc toUpper(s: string): string {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                              raises: [], tags: [].}
Converts s into upper-case runes.   Source Edit
proc toLower(s: string): string {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                              raises: [], tags: [].}
Converts s into lower-case runes.   Source Edit
proc swapCase(s: string): string {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1",
                               raises: [], tags: [].}

Swaps the case of runes in s.

Returns a new string such that the cases of all runes are swapped if possible.

  Source Edit
proc capitalize(s: string): string {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1",
                                 raises: [], tags: [].}
Converts the first character of s into an upper-case rune.   Source Edit
proc translate(s: string; replacements: proc (key: string): string): string {...}{.gcsafe,
    extern: "nuc$1", raises: [], tags: [].}

Translates words in a string using the replacements proc to substitute words inside s with their replacements.

replacements is any proc that takes a word and returns a new word to fill its place.

  Source Edit
proc title(s: string): string {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1",
                            raises: [], tags: [].}

Converts s to a unicode title.

Returns a new string such that the first character in each word inside s is capitalized.

  Source Edit
proc isTitle(s: string): bool {...}{.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", deprecated: "Deprecated since version 0.20 since its semantics are unclear",
                            raises: [], tags: [].}

Checks whether or not s is a unicode title.

Returns true if the first character in each word inside s is upper case and there is at least one character in s.

  Source Edit
proc toRunes(s: string): seq[Rune] {...}{.raises: [], tags: [].}
Obtains a sequence containing the Runes in s.   Source Edit
proc cmpRunesIgnoreCase(a, b: string): int {...}{.gcsafe, extern: "nuc$1", procvar,
                                        raises: [], tags: [].}
Compares two UTF-8 strings and ignores the case. Returns:

0 iff a == b
< 0 iff a < b
> 0 iff a > b

  Source Edit
proc reversed(s: string): string {...}{.raises: [], tags: [].}
Returns the reverse of s, interpreting it as runes. Unicode combining characters are correctly interpreted as well.

Examples:

assert reversed("Reverse this!") == "!siht esreveR"
assert reversed("先秦兩漢") == "漢兩秦先"
assert reversed("as⃝df̅") == "f̅ds⃝a"
assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  Source Edit
proc graphemeLen(s: string; i: Natural): Natural {...}{.raises: [], tags: [].}
The number of bytes belonging to s[i] including following combining characters.   Source Edit
proc lastRune(s: string; last: int): (Rune, int) {...}{.raises: [], tags: [].}
Length of the last rune in s[0..last]. Returns the rune and its length in bytes.   Source Edit
proc size(r: Rune): int {...}{.noSideEffect, raises: [], tags: [].}
Returns the number of bytes the rune r takes.   Source Edit
proc splitWhitespace(s: string): seq[string] {...}{.noSideEffect, gcsafe,
    extern: "ncuSplitWhitespace", raises: [], tags: [].}
The same as the splitWhitespace iterator, but is a proc that returns a sequence of substrings.   Source Edit
proc split(s: string; seps: openArray[Rune] = unicodeSpaces; maxsplit: int = -1): seq[
    string] {...}{.noSideEffect, gcsafe, extern: "nucSplitRunes", raises: [], tags: [].}
The same as the split iterator, but is a proc that returns a sequence of substrings.   Source Edit
proc split(s: string; sep: Rune; maxsplit: int = -1): seq[string] {...}{.noSideEffect, gcsafe,
    extern: "nucSplitRune", raises: [], tags: [].}
The same as the split iterator, but is a proc that returns a sequence of substrings.   Source Edit
proc strip(s: string; leading = true; trailing = true;
          runes: openArray[Rune] = unicodeSpaces): string {...}{.noSideEffect, gcsafe,
    extern: "nucStrip", raises: [], tags: [].}

Strips leading or trailing runes from s and returns the resulting string.

If leading is true, leading runes are stripped. If trailing is true, trailing runes are stripped. If both are false, the string is returned unchanged.

  Source Edit
proc repeat(c: Rune; count: Natural): string {...}{.noSideEffect, gcsafe,
    extern: "nucRepeatRune", raises: [], tags: [].}

Returns a string of count Runes c.

The returned string will have a rune-length of count.

  Source Edit
proc align(s: string; count: Natural; padding = ' '.Rune): string {...}{.noSideEffect, gcsafe,
    extern: "nucAlignString", raises: [], tags: [].}

Aligns a unicode string s with padding, so that it has a rune-length of count.

padding characters (by default spaces) are added before s resulting in right alignment. If s.runeLen >= count, no padding is added and s is returned unchanged. If you need to left align a string use the alignLeft proc.

Examples:

assert align("abc", 4) == " abc"
assert align("a", 0) == "a"
assert align("1232", 6) == "  1232"
assert align("1232", 6, '#'.Rune) == "##1232"
assert align("Åge", 5) == "  Åge"
assert align("×", 4, '_'.Rune) == "___×"
  Source Edit
proc alignLeft(s: string; count: Natural; padding = ' '.Rune): string {...}{.noSideEffect,
    raises: [], tags: [].}

Left-Aligns a unicode string s with padding, so that it has a rune-length of count.

padding characters (by default spaces) are added after s resulting in left alignment. If s.runeLen >= count, no padding is added and s is returned unchanged. If you need to right align a string use the align proc.

Examples:

assert alignLeft("abc", 4) == "abc "
assert alignLeft("a", 0) == "a"
assert alignLeft("1232", 6) == "1232  "
assert alignLeft("1232", 6, '#'.Rune) == "1232##"
assert alignLeft("Åge", 5) == "Åge  "
assert alignLeft("×", 4, '_'.Rune) == "×___"
  Source Edit

Iterators

iterator runes(s: string): Rune {...}{.raises: [], tags: [].}
Iterates over each rune of the string s returning runes.   Source Edit
iterator utf8(s: string): string {...}{.raises: [], tags: [].}
Iterates over each rune of the string s returning utf8 values.   Source Edit
iterator split(s: string; seps: openArray[Rune] = unicodeSpaces; maxsplit: int = -1): string {...}{.
    raises: [], tags: [].}

Splits the unicode string s into substrings using a group of separators.

Substrings are separated by a substring containing only seps.

for word in split("this\lis an\texample"):
  writeLine(stdout, word)

...generates this output:

"this"
"is"
"an"
"example"

And the following code:

for word in split("this:is;an$example", ";:$".toRunes):
  writeLine(stdout, word)

...produces the same output as the first example. The code:

let date = "2012-11-20T22:08:08.398990"
let separators = " -:T".toRunes
for number in split(date, separators):
  writeLine(stdout, number)

...results in:

"2012"
"11"
"20"
"22"
"08"
"08.398990"
  Source Edit
iterator splitWhitespace(s: string): string {...}{.raises: [], tags: [].}
Splits a unicode string at whitespace runes.   Source Edit
iterator split(s: string; sep: Rune; maxsplit: int = -1): string {...}{.raises: [], tags: [].}

Splits the unicode string s into substrings using a single separator.

Substrings are separated by the rune sep. The code:

for word in split(";;this;is;an;;example;;;", ";".runeAt(0)):
  writeLine(stdout, word)

Results in:

""
""
"this"
"is"
"an"
""
"example"
""
""
""
  Source Edit

Templates

template fastRuneAt(s: string; i: int; result: untyped; doInc = true)
Returns the rune s[i] in result. If doInc == true, i is incremented by the number of bytes that have been processed.   Source Edit
template fastToUTF8Copy(c: Rune; s: var string; pos: int; doInc = true)

Copies UTF-8 representation of c into the preallocated string s starting at position pos. If doInc == true, pos is incremented by the number of bytes that have been processed.

To be the most efficient, make sure s is preallocated with an additional amount equal to the byte length of c.

  Source Edit