src/Tools/WWW_Find/unicode_symbols.ML
author wenzelm
Sat, 23 Jul 2011 16:37:17 +0200
changeset 44818 9b00f09f7721
parent 44475 8c89a1fb30f2
child 50007 0518bf89c777
permissions -rw-r--r--
defer evaluation of Scan.message, for improved performance in the frequent situation where failure is handled later (e.g. via ||);
     1 (*  Title:      Tools/WWW_Find/unicode_symbols.ML
     2     Author:     Timothy Bourke, NICTA
     3 
     4 Parse the ISABELLE_HOME/etc/symbols file.
     5 *)
     6 
     7 signature UNICODE_SYMBOLS =
     8 sig
     9 val symbol_to_unicode : string -> int option
    10 val abbrev_to_unicode : string -> int option
    11 val unicode_to_symbol : int -> string option
    12 val unicode_to_abbrev : int -> string option
    13 val utf8_to_symbols   : string -> string
    14 val utf8              : int list -> string
    15 end;
    16 
    17 structure UnicodeSymbols : UNICODE_SYMBOLS =
    18 struct
    19 
    20 local (* Lexer *)
    21 
    22 open Basic_Symbol_Pos
    23 
    24 val keywords =
    25   Scan.make_lexicon (map Symbol.explode ["code", "font", "abbrev", ":"]);
    26 
    27 datatype token_kind =
    28   Symbol | AsciiSymbol | Keyword | Name | Code | Space | Comment | EOF;
    29 
    30 datatype token = Token of token_kind * string * Position.range;
    31 
    32 fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss);
    33 
    34 in
    35 
    36 fun mk_eof pos = Token (EOF, "", (pos, Position.none));
    37 
    38 fun str_of_token (Token (_, s, _)) = s;
    39 
    40 fun pos_of_token (Token (_, _, (pos, _))) = pos;
    41 
    42 fun int_of_code (Token (Code, s, _)) = #value (Lexicon.read_xnum s)
    43   | int_of_code _ = error "internal error in UnicodeSymbols.int_of_code"
    44 
    45 fun is_proper (Token (Space, _, _)) = false
    46   | is_proper (Token (Comment, _, _)) = false
    47   | is_proper _ = true;
    48 
    49 fun is_keyword kw (Token (Keyword, kw', _)) = (kw = kw')
    50   | is_keyword _ _ = false;
    51 
    52 fun is_ascii_sym (Token (AsciiSymbol, _, _)) = true
    53   | is_ascii_sym _ = false;
    54 
    55 fun is_hex_code (Token (Code, _, _)) = true
    56   | is_hex_code _ = false;
    57 
    58 fun is_symbol (Token (Symbol, _, _)) = true
    59   | is_symbol _ = false;
    60 
    61 fun is_name (Token (Name, _, _)) = true
    62   | is_name _ = false;
    63 
    64 fun is_eof (Token (EOF, _, _)) = true
    65   | is_eof _ = false;
    66 
    67 fun end_position_of (Token (_, _, (_, epos))) = epos;
    68 
    69 val is_space = Symbol_Pos.symbol #> (fn s => Symbol.is_blank s andalso s <> "\n");
    70 val scan_space =
    71   (Scan.many1 is_space @@@ Scan.optional ($$$ "\n") []
    72    ||
    73    Scan.many is_space @@@ ($$$ "\n")) >> token Space;
    74 
    75 val scan_code = Lexicon.scan_hex #>> token Code;
    76 
    77 val scan_ascii_symbol = Scan.one
    78   ((fn x => Symbol.is_ascii x andalso
    79       not (Symbol.is_ascii_letter x
    80            orelse Symbol.is_ascii_digit x
    81            orelse Symbol.is_ascii_blank x)) o Symbol_Pos.symbol)
    82   -- Scan.many (not o Symbol.is_ascii_blank o Symbol_Pos.symbol)
    83   >> (token AsciiSymbol o op ::);
    84 
    85 fun not_contains xs c = not (member (op =) (raw_explode xs) (Symbol_Pos.symbol c));
    86 val scan_comment =
    87   $$$ "#" |-- (Scan.many (not_contains "\n") @@@ ($$$ "\n"))
    88   >> token Comment;
    89 
    90 fun is_sym s =
    91   (case Symbol.decode s of
    92     Symbol.Sym _ => true
    93   | Symbol.Ctrl _ => true
    94   | _ => false);
    95 
    96 fun tokenize syms =
    97   let
    98     val scanner =
    99       Scan.one (is_sym o Symbol_Pos.symbol) >> (token Symbol o single) ||
   100       scan_comment ||
   101       scan_space ||
   102       scan_code ||
   103       Scan.literal keywords >> token Keyword ||
   104       scan_ascii_symbol ||
   105       Lexicon.scan_id >> token Name;
   106     val scan_token = Symbol_Pos.!!! (fn () => "Lexical error") (Scan.bulk scanner);
   107   in
   108     (case Scan.error (Scan.finite Symbol_Pos.stopper scan_token) syms of
   109       (toks, []) => toks
   110     | (_, ss) =>
   111         error ("Lexical error at: " ^ Symbol_Pos.content ss ^
   112           Position.str_of (#1 (Symbol_Pos.range ss))))
   113   end;
   114 
   115 val stopper =
   116   Scan.stopper
   117     (fn [] => mk_eof Position.none
   118       | toks => mk_eof (end_position_of (List.last toks))) is_eof;
   119 
   120 end;
   121 
   122 local (* Parser *)
   123 
   124 fun $$$ kw = Scan.one (is_keyword kw) >> str_of_token;
   125 val hex_code = Scan.one is_hex_code >> int_of_code;
   126 val ascii_sym = Scan.one is_ascii_sym >> str_of_token;
   127 val symbol = Scan.one is_symbol >> (fn t => (str_of_token t, pos_of_token t));
   128 val name = Scan.one is_name >> str_of_token;
   129 
   130 val unicode = $$$ "code" -- $$$ ":" |-- hex_code;
   131 val font = Scan.option ($$$ "font" -- $$$ ":" |-- name);
   132 val abbr = Scan.option ($$$ "abbrev" -- $$$ ":"
   133                         |-- (ascii_sym || $$$ ":" || name));
   134 
   135 in
   136 
   137 val line = (symbol -- unicode --| font -- abbr) >> Parse.triple1;
   138 
   139 end;
   140 
   141 local (* build tables *)
   142 
   143 fun add_entries ((fromsym, fromabbr, tosym, toabbr), ((sym, pos), uni, oabbr)) =
   144   (case oabbr of
   145      NONE =>
   146        (Symtab.update_new (sym, uni) fromsym,
   147         fromabbr,
   148         Inttab.update (uni, sym) tosym,
   149         toabbr)
   150    | SOME abbr =>
   151        (Symtab.update_new (sym, uni) fromsym,
   152         Symtab.update_new (abbr, uni) fromabbr,
   153         Inttab.update (uni, sym) tosym,
   154         Inttab.update (uni, abbr) toabbr))
   155   handle Symtab.DUP sym => error ("Duplicate at" ^ Position.str_of pos)
   156        | Inttab.DUP sym => error ("Duplicate code at" ^ Position.str_of pos);
   157 
   158 in
   159 
   160 fun read_symbols path =
   161   let
   162     val parsed_lines =
   163       Symbol_Pos.explode (File.read path, Path.position path)
   164       |> tokenize
   165       |> filter is_proper
   166       |> Scan.finite stopper (Scan.repeat line)
   167       |> fst;
   168   in
   169     Library.foldl add_entries
   170       ((Symtab.empty, Symtab.empty, Inttab.empty, Inttab.empty),
   171        parsed_lines)
   172   end;
   173 
   174 end;
   175 
   176 local
   177 val (fromsym, fromabbr, tosym, toabbr) = read_symbols (Path.explode "~~/etc/symbols");
   178 in
   179 val symbol_to_unicode = Symtab.lookup fromsym;
   180 val abbrev_to_unicode = Symtab.lookup fromabbr;
   181 val unicode_to_symbol = Inttab.lookup tosym;
   182 val unicode_to_abbrev = Inttab.lookup toabbr;
   183 end;
   184 
   185 fun utf8_to_symbols utf8str =
   186   let
   187     val get_next =
   188       Substring.getc
   189       #> Option.map (apfst Byte.charToByte);
   190     val wstr = String.str o Byte.byteToChar;
   191     val replacement_char = "\<questiondown>";
   192 
   193     fun word_to_symbol w =
   194       (case (unicode_to_symbol o Word32.toInt) w of
   195          NONE => "?"
   196        | SOME s => s);
   197 
   198     fun andb32 (w1, w2) =
   199       Word8.andb(w1, w2)
   200       |> Word8.toLarge
   201       |> Word32.fromLarge;
   202 
   203     fun read_next (ss, 0, c) = (word_to_symbol c, ss)
   204       | read_next (ss, n, c) =
   205           (case get_next ss of
   206              NONE => (replacement_char, ss)
   207            | SOME (w, ss') =>
   208                if Word8.andb (w, 0wxc0) <> 0wx80
   209                then (replacement_char, ss')
   210                else
   211                  let
   212                    val w' = (Word8.andb (w, 0wx3f));
   213                    val bw = (Word32.fromLarge o Word8.toLarge) w';
   214                    val c' = Word32.<< (c, 0wx6);
   215                  in read_next (ss', n - 1, Word32.orb (c', bw)) end);
   216 
   217     fun do_char (w, ss) =
   218       if Word8.andb (w, 0wx80) = 0wx00
   219       then (wstr w, ss)
   220       else if Word8.andb (w, 0wx60) = 0wx40
   221       then read_next (ss, 1, andb32 (w, 0wx1f))
   222       else if Word8.andb (w, 0wxf0) = 0wxe0
   223       then read_next (ss, 2, andb32 (w, 0wx0f))
   224       else if Word8.andb (w, 0wxf8) = 0wxf0
   225       then read_next (ss, 3, andb32 (w, 0wx07))
   226       else (replacement_char, ss);
   227 
   228     fun read (rs, ss) =
   229       (case Option.map do_char (get_next ss) of
   230          NONE => (implode o rev) rs
   231        | SOME (s, ss') => read (s::rs, ss'));
   232   in read ([], Substring.full utf8str) end;
   233 
   234 local
   235 
   236 fun consec n =
   237   fn w => (
   238     Word32.>> (w, Word.fromInt (n * 6))
   239     |> (curry Word32.andb) 0wx3f
   240     |> (curry Word32.orb) 0wx80
   241     |> Word8.fromLargeWord o Word32.toLargeWord);
   242 
   243 fun stamp n =
   244   fn w => (
   245     Word32.>> (w, Word.fromInt (n * 6))
   246     |> (curry Word32.andb) (Word32.>> (0wx000000ff, Word.fromInt (n + 2)))
   247     |> (curry Word32.orb) (Word32.<< (0wxffffffff, Word.fromInt (7 - n)))
   248     |> Word8.fromLargeWord o Word32.toLargeWord);
   249 
   250 fun to_utf8_bytes i =
   251   if i <= 0x007f
   252   then [Word8.fromInt i]
   253   else let
   254     val w = Word32.fromInt i;
   255   in
   256     if i < 0x07ff
   257     then [stamp 1 w, consec 0 w]
   258     else if i < 0xffff
   259     then [stamp 2 w, consec 1 w, consec 0 w]
   260     else if i < 0x10ffff
   261     then [stamp 3 w, consec 2 w, consec 1 w, consec 0 w]
   262     else []
   263   end;
   264 
   265 in
   266 
   267 fun utf8 is =
   268   map to_utf8_bytes is
   269   |> flat
   270   |> Word8Vector.fromList
   271   |> Byte.bytesToString;
   272 
   273 end
   274 
   275 end;
   276