defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
1 (* Title: Tools/WWW_Find/unicode_symbols.ML
2 Author: Timothy Bourke, NICTA
4 Parse the ISABELLE_HOME/etc/symbols file.
7 signature UNICODE_SYMBOLS =
9 val symbol_to_unicode : string -> int option
10 val abbrev_to_unicode : string -> int option
11 val unicode_to_symbol : int -> string option
12 val unicode_to_abbrev : int -> string option
13 val utf8_to_symbols : string -> string
14 val utf8 : int list -> string
17 structure UnicodeSymbols : UNICODE_SYMBOLS =
25 Scan.make_lexicon (map Symbol.explode ["code", "font", "abbrev", ":"]);
28 Symbol | AsciiSymbol | Keyword | Name | Code | Space | Comment | EOF;
30 datatype token = Token of token_kind * string * Position.range;
32 fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss);
36 fun mk_eof pos = Token (EOF, "", (pos, Position.none));
38 fun str_of_token (Token (_, s, _)) = s;
40 fun pos_of_token (Token (_, _, (pos, _))) = pos;
42 fun int_of_code (Token (Code, s, _)) = #value (Lexicon.read_xnum s)
43 | int_of_code _ = error "internal error in UnicodeSymbols.int_of_code"
45 fun is_proper (Token (Space, _, _)) = false
46 | is_proper (Token (Comment, _, _)) = false
49 fun is_keyword kw (Token (Keyword, kw', _)) = (kw = kw')
50 | is_keyword _ _ = false;
52 fun is_ascii_sym (Token (AsciiSymbol, _, _)) = true
53 | is_ascii_sym _ = false;
55 fun is_hex_code (Token (Code, _, _)) = true
56 | is_hex_code _ = false;
58 fun is_symbol (Token (Symbol, _, _)) = true
59 | is_symbol _ = false;
61 fun is_name (Token (Name, _, _)) = true
64 fun is_eof (Token (EOF, _, _)) = true
67 fun end_position_of (Token (_, _, (_, epos))) = epos;
69 val is_space = Symbol_Pos.symbol #> (fn s => Symbol.is_blank s andalso s <> "\n");
71 (Scan.many1 is_space @@@ Scan.optional ($$$ "\n") []
73 Scan.many is_space @@@ ($$$ "\n")) >> token Space;
75 val scan_code = Lexicon.scan_hex #>> token Code;
77 val scan_ascii_symbol = Scan.one
78 ((fn x => Symbol.is_ascii x andalso
79 not (Symbol.is_ascii_letter x
80 orelse Symbol.is_ascii_digit x
81 orelse Symbol.is_ascii_blank x)) o Symbol_Pos.symbol)
82 -- Scan.many (not o Symbol.is_ascii_blank o Symbol_Pos.symbol)
83 >> (token AsciiSymbol o op ::);
85 fun not_contains xs c = not (member (op =) (raw_explode xs) (Symbol_Pos.symbol c));
87 $$$ "#" |-- (Scan.many (not_contains "\n") @@@ ($$$ "\n"))
91 (case Symbol.decode s of
93 | Symbol.Ctrl _ => true
99 Scan.one (is_sym o Symbol_Pos.symbol) >> (token Symbol o single) ||
103 Scan.literal keywords >> token Keyword ||
105 Lexicon.scan_id >> token Name;
106 val scan_token = Symbol_Pos.!!! (fn () => "Lexical error") (Scan.bulk scanner);
108 (case Scan.error (Scan.finite Symbol_Pos.stopper scan_token) syms of
111 error ("Lexical error at: " ^ Symbol_Pos.content ss ^
112 Position.str_of (#1 (Symbol_Pos.range ss))))
117 (fn [] => mk_eof Position.none
118 | toks => mk_eof (end_position_of (List.last toks))) is_eof;
124 fun $$$ kw = Scan.one (is_keyword kw) >> str_of_token;
125 val hex_code = Scan.one is_hex_code >> int_of_code;
126 val ascii_sym = Scan.one is_ascii_sym >> str_of_token;
127 val symbol = Scan.one is_symbol >> (fn t => (str_of_token t, pos_of_token t));
128 val name = Scan.one is_name >> str_of_token;
130 val unicode = $$$ "code" -- $$$ ":" |-- hex_code;
131 val font = Scan.option ($$$ "font" -- $$$ ":" |-- name);
132 val abbr = Scan.option ($$$ "abbrev" -- $$$ ":"
133 |-- (ascii_sym || $$$ ":" || name));
137 val line = (symbol -- unicode --| font -- abbr) >> Parse.triple1;
141 local (* build tables *)
143 fun add_entries ((fromsym, fromabbr, tosym, toabbr), ((sym, pos), uni, oabbr)) =
146 (Symtab.update_new (sym, uni) fromsym,
148 Inttab.update (uni, sym) tosym,
151 (Symtab.update_new (sym, uni) fromsym,
152 Symtab.update_new (abbr, uni) fromabbr,
153 Inttab.update (uni, sym) tosym,
154 Inttab.update (uni, abbr) toabbr))
155 handle Symtab.DUP sym => error ("Duplicate at" ^ Position.str_of pos)
156 | Inttab.DUP sym => error ("Duplicate code at" ^ Position.str_of pos);
160 fun read_symbols path =
163 Symbol_Pos.explode (File.read path, Path.position path)
166 |> Scan.finite stopper (Scan.repeat line)
169 Library.foldl add_entries
170 ((Symtab.empty, Symtab.empty, Inttab.empty, Inttab.empty),
177 val (fromsym, fromabbr, tosym, toabbr) = read_symbols (Path.explode "~~/etc/symbols");
179 val symbol_to_unicode = Symtab.lookup fromsym;
180 val abbrev_to_unicode = Symtab.lookup fromabbr;
181 val unicode_to_symbol = Inttab.lookup tosym;
182 val unicode_to_abbrev = Inttab.lookup toabbr;
185 fun utf8_to_symbols utf8str =
189 #> Option.map (apfst Byte.charToByte);
190 val wstr = String.str o Byte.byteToChar;
191 val replacement_char = "\<questiondown>";
193 fun word_to_symbol w =
194 (case (unicode_to_symbol o Word32.toInt) w of
198 fun andb32 (w1, w2) =
203 fun read_next (ss, 0, c) = (word_to_symbol c, ss)
204 | read_next (ss, n, c) =
206 NONE => (replacement_char, ss)
208 if Word8.andb (w, 0wxc0) <> 0wx80
209 then (replacement_char, ss')
212 val w' = (Word8.andb (w, 0wx3f));
213 val bw = (Word32.fromLarge o Word8.toLarge) w';
214 val c' = Word32.<< (c, 0wx6);
215 in read_next (ss', n - 1, Word32.orb (c', bw)) end);
217 fun do_char (w, ss) =
218 if Word8.andb (w, 0wx80) = 0wx00
220 else if Word8.andb (w, 0wx60) = 0wx40
221 then read_next (ss, 1, andb32 (w, 0wx1f))
222 else if Word8.andb (w, 0wxf0) = 0wxe0
223 then read_next (ss, 2, andb32 (w, 0wx0f))
224 else if Word8.andb (w, 0wxf8) = 0wxf0
225 then read_next (ss, 3, andb32 (w, 0wx07))
226 else (replacement_char, ss);
229 (case Option.map do_char (get_next ss) of
230 NONE => (implode o rev) rs
231 | SOME (s, ss') => read (s::rs, ss'));
232 in read ([], Substring.full utf8str) end;
238 Word32.>> (w, Word.fromInt (n * 6))
239 |> (curry Word32.andb) 0wx3f
240 |> (curry Word32.orb) 0wx80
241 |> Word8.fromLargeWord o Word32.toLargeWord);
245 Word32.>> (w, Word.fromInt (n * 6))
246 |> (curry Word32.andb) (Word32.>> (0wx000000ff, Word.fromInt (n + 2)))
247 |> (curry Word32.orb) (Word32.<< (0wxffffffff, Word.fromInt (7 - n)))
248 |> Word8.fromLargeWord o Word32.toLargeWord);
250 fun to_utf8_bytes i =
252 then [Word8.fromInt i]
254 val w = Word32.fromInt i;
257 then [stamp 1 w, consec 0 w]
259 then [stamp 2 w, consec 1 w, consec 0 w]
261 then [stamp 3 w, consec 2 w, consec 1 w, consec 0 w]
270 |> Word8Vector.fromList
271 |> Byte.bytesToString;