****************
* SNOBOL4 functions to implement UTF-encoded unicode handling
* $Id: utf.sno,v 1.2 2000/03/21 05:35:41 phil Exp $
* Phil Budne
* September 1996
* updated for 4/5/6 byte runes June 1997
*
* Most of this is just AWFUL, but this is just a proof of
* concept, and like all SNOBOL there is always a smaller, faster
* and more perverse way to do anything.
****************
* TODO;
*
*	UTFNOTANY
*	UTFSPAN
*	UTFBREAK
*	UTFBREAKX
****************
* Pattern to match one UTF encoded rune
*
* XCHAR(N) -- return the one-character string found at offset N of
* &ALPHABET (this file uses it as the character with byte value N).
*	N	offset into &ALPHABET; must not be negative
* Fails (FRETURN) when N is negative or lies past the last character
* of &ALPHABET.
* SNOBOL4 lacks CHAR(), SPITBOL CHAR() loses on CHAR(0)!
	DEFINE("XCHAR(N)")			:(EXCHAR)
* guard first, so a negative N fails cleanly instead of reaching TAB()
XCHAR	LT(N,0)					:S(FRETURN)
	&ALPHABET TAB(N) LEN(1) . XCHAR		:S(RETURN)F(FRETURN)
EXCHAR
*
* RANGE(START,END) -- helper returning an ANY() pattern that matches
* one byte whose value lies in START..END, both ends included.
*	START	lowest byte value in the range
*	END	highest byte value in the range
* Fails (FRETURN) when START is negative, when START is greater than
* END, or when the range runs past the end of &ALPHABET.
* the true SNO-head might implement this as a function that
* takes strings of SPAN('01') ARBNO('x') directly!!
	DEFINE("RANGE(START,END)")		:(ERANGE)
RANGE	LT(START,0)				:S(FRETURN)
	GT(START,END)				:S(FRETURN)
* Slice the run of characters out of &ALPHABET by position.  Locating
* the two end characters separately would need two distinct characters
* and so could never build a one-value range (START equal to END).
	&ALPHABET TAB(START) LEN(END - START + 1) . RANGE :F(FRETURN)
* SPITBOL compiles ANY() into a bitmap;
* SPITBOL compiles tables once;
	RANGE = ANY(RANGE)			:(RETURN)
*
* for SNOBOL4; create big pattern
*	GT(START,END)				:S(FRETURN)
*	RANGE = XCHAR(START)
*RANGE1	GE(START,END)				:S(RETURN)
*	START = START + 1
*	RANGE = RANGE | XCHAR(START)		:(RANGE1)
*
ERANGE
*
* Byte classes used to recognise an encoded rune.
*
* single byte:		0xxxxxxx	values 0..0x7f
	UTF.T1 = RANGE(0,127)
* two-byte leader:	110xxxxx	values 0x80..0x7ff
	UTF.T2 = RANGE(192,223)
* three-byte leader:	1110xxxx	values 0x800..0xffff
	UTF.T3 = RANGE(224,239)
* NEW in UTF-8???
* four-byte leader:	11110xxx	values 0x10000..0x1fffff
	UTF.T4 = RANGE(240,247)
* five-byte leader:	111110xx	values 0x200000..0x3ffffff
	UTF.T5 = RANGE(248,251)
* six-byte leader:	1111110x	values 0x4000000..0x7fffffff
	UTF.T6 = RANGE(252,253)
* subsequent byte:	10xxxxxx
	UTF.Tx = RANGE(128,191)
* match first byte, then do table lookup (bad entries contain FAIL?)
* on LEN(1) $ FIRSTBYTE?
* UTF.RUNE matches exactly one encoded rune: a single byte, or a
* leader byte followed by the number of continuation bytes that the
* leader calls for.  It is built from the byte classes UTF.T1..UTF.T6
* and UTF.Tx defined earlier in this file.
	UTF.RUNE = FENCE
+		(UTF.T1 |
+		 UTF.T2 UTF.Tx |
+		 UTF.T3 UTF.Tx UTF.Tx |
+		 UTF.T4 UTF.Tx UTF.Tx UTF.Tx |
+		 UTF.T5 UTF.Tx UTF.Tx UTF.Tx UTF.Tx |
+		 UTF.T6 UTF.Tx UTF.Tx UTF.Tx UTF.Tx UTF.Tx)
* the same pattern, but only at the very start of the subject
	UTF.RUNE0 = POS(0) UTF.RUNE
* pattern to move to next sync char;
*	UTF.SYNC = BREAKX(UTF.T1 UTF.T2 UTF.T3 UTF.T4 UTF.T5 UTF.T6)
****************
* UTF analog of ARB
	UTFARB = ARBNO(UTF.RUNE)
****************
* UTF analog of LEN
*
* UTFLEN(N) -- return a pattern that matches exactly N runes.
*	N	number of runes, at least 1
* Fails (FRETURN) when N is less than 1.
	DEFINE("UTFLEN(N)")			:(EUTFLEN)
* the goto must be conditional: an unconditional transfer to FRETURN
* here would make every call fail
UTFLEN	LT(N,1)					:S(FRETURN)
	UTFLEN = UTF.RUNE
UTFLEN2	N = N - 1
	EQ(N,0)					:S(RETURN)
	UTFLEN = UTFLEN UTF.RUNE		:(UTFLEN2)
EUTFLEN
****************
* UTF analog for ANY(STR)
*
* UTFANY(STR) -- return a pattern matching any one of the runes in STR
* (an alternation of the individual runes, in the order given).
*	STR	string of one or more encoded runes
* Fails (FRETURN) when STR is empty or is not a sequence of valid
* runes.
	DEFINE("UTFANY(STR)R")			:(EUTFANY)
* the first rune seeds the result; each later one is added with "|"
UTFANY	STR UTF.RUNE0 . UTFANY =		:F(FRETURN)
UTFANY2	IDENT(STR)				:S(RETURN)
	STR UTF.RUNE0 . R =			:F(FRETURN)
	UTFANY = UTFANY | R			:(UTFANY2)
EUTFANY
****************
* UTF analog for CHAR(N)
*
* RUNE(N) -- return the encoded form of rune number N as a string of
* one to six bytes.
*	N	rune value, 0..2147483647
* Fails (FRETURN) when N is outside that range.
* Each continuation byte carries six bits, so the byte that is K
* places from the end holds REMDR(N / 64 ** K, 64).
	DEFINE("RUNE(N)")			:(ERUNE)
RUNE	LT(N,0)					:S(FRETURN)
	GT(N,127)				:S(RUNE2)
* single byte rune;
	RUNE = XCHAR(N)				:(RETURN)
RUNE2	GT(N,2047)				:S(RUNE3)
* two-byte rune;
	RUNE = XCHAR(192 + (N / 64))
+		XCHAR(128 + REMDR(N, 64))	:(RETURN)
* three-byte rune;
RUNE3	GT(N,65535)				:S(RUNE4)
	RUNE = XCHAR(224 + (N / 4096))
+		XCHAR(128 + REMDR(N / 64, 64))
+		XCHAR(128 + REMDR(N, 64))	:(RETURN)
* four-byte rune;
* the second byte holds bits 12..17, hence the divisor 4096
RUNE4	GT(N,2097151)				:S(RUNE5)
	RUNE = XCHAR(240 + (N / 262144))
+		XCHAR(128 + REMDR(N / 4096, 64))
+		XCHAR(128 + REMDR(N / 64, 64))
+		XCHAR(128 + REMDR(N, 64))	:(RETURN)
* five-byte rune;
RUNE5	GT(N,67108863)				:S(RUNE6)
	RUNE = XCHAR(248 + (N / 16777216))
+		XCHAR(128 + REMDR(N / 262144, 64))
+		XCHAR(128 + REMDR(N / 4096, 64))
+		XCHAR(128 + REMDR(N / 64, 64))
+		XCHAR(128 + REMDR(N, 64))	:(RETURN)
* six-byte rune;
RUNE6	GT(N,2147483647)			:S(FRETURN)
	RUNE = XCHAR(252 + (N / 1073741824))
+		XCHAR(128 + REMDR(N / 16777216, 64))
+		XCHAR(128 + REMDR(N / 262144, 64))
+		XCHAR(128 + REMDR(N / 4096, 64))
+		XCHAR(128 + REMDR(N / 64, 64))
+		XCHAR(128 + REMDR(N, 64))	:(RETURN)
ERUNE
****************
* UTF analog for SIZE(STR)
*
* UTFSIZE(STR) -- return the number of runes in STR (0 when STR is
* empty).
*	STR	string of encoded runes
* Fails (FRETURN) when STR is not entirely made of valid runes.
	DEFINE("UTFSIZE(STR)")			:(EUTFSIZE)
UTFSIZE	UTFSIZE = 0
* Runes are removed from the front only (UTF.RUNE0).  With an
* unanchored match a rune could be deleted from the middle of a bad
* string, letting the bytes around it join up into a valid rune.
UTFSIZ1	STR UTF.RUNE0 =				:F(UTFSIZ2)
	UTFSIZE = UTFSIZE + 1			:(UTFSIZ1)
UTFSIZ2	IDENT(STR)				:S(RETURN)F(FRETURN)
EUTFSIZE
****************
* UTF analog for REPLACE()
*
* UTFREPLACE(IN,FROM,TO) -- return a copy of IN in which every rune
* that occurs in FROM is replaced by the rune at the same position in
* TO; other runes are copied unchanged.
*	IN	string to translate
*	FROM	runes to be replaced
*	TO	replacement runes, one for each rune of FROM
* Fails (FRETURN) when FROM and TO do not hold the same number of
* runes, or when any of the three arguments is not a sequence of
* valid runes.
	DEFINE("UTFREPLACE(IN,FROM,TO)T,R1,R2")	:(EUTFREPLACE)
UTFREPLACE T = TABLE()
* peel runes off input and output languages one at a time
UTFREP1	FROM UTF.RUNE0 . R1 =			:F(UTFREP2)
* a rune was just taken from FROM, so TO must supply its partner
	TO UTF.RUNE0 . R2 =			:F(FRETURN)
* store the mapping under the FROM rune (square brackets are the
* alternate spelling of the angle-bracket table reference)
	T[R1] = R2				:(UTFREP1)
* both input and output languages should now be empty
UTFREP2	DIFFER(FROM)				:S(FRETURN)
	DIFFER(TO)				:S(FRETURN)
* freeze table for SNOBOL4+?
* peel runes input one at a time, feed thru table
	UTFREPLACE =
UTFREP3	IN UTF.RUNE0 . R1 =			:F(UTFREP4)
	R2 = T[R1]
* no table entry: keep the rune as it is (the assignment is skipped
* when IDENT fails, i.e. when a replacement was found)
	R2 = IDENT(R2) R1
	UTFREPLACE = UTFREPLACE R2		:(UTFREP3)
UTFREP4	IDENT(IN)				:S(RETURN)F(FRETURN)
EUTFREPLACE
****************************************************************
* tests
*
*	&ANCHOR = 1
*	S = "Hello World!!"
*
*	OUTPUT = UTFREPLACE(S,&UCASE "!",&LCASE "?")
*
*L	S UTF.RUNE $ OUTPUT =			:S(L)
*
*	"HELLO WORLD!" UTFARB . OUTPUT RPOS(0)
*	"Hello World!" UTFARB UTFANY(&LCASE) . OUTPUT
*	"Hello World!" UTFANY(&UCASE) . OUTPUT
*
*	OUTPUT = RUNE(0)
*	OUTPUT = RUNE(127)
*	OUTPUT = '-------'
*	OUTPUT = RUNE(128)
*	OUTPUT = RUNE(255)
*	OUTPUT = RUNE(2047)
*	OUTPUT = '-------'
*	OUTPUT = RUNE(2048)
*	OUTPUT = RUNE(65535)
*	OUTPUT = '-------'
*	OUTPUT = RUNE(65536)
*	OUTPUT = RUNE(2097151)
*	OUTPUT = '-------'
*	OUTPUT = RUNE(2097152)
*	OUTPUT = RUNE(67108863)
*	OUTPUT = '-------'
*	OUTPUT = RUNE(67108864)
*	OUTPUT = RUNE(2147483647)
*
*END