wenzelm@6118
|
1 |
(* Title: Pure/General/symbol.ML
|
wenzelm@6116
|
2 |
ID: $Id$
|
wenzelm@6116
|
3 |
Author: Markus Wenzel, TU Muenchen
|
wenzelm@6116
|
4 |
|
paulson@21897
|
5 |
Generalized characters with infinitely many named symbols.
|
wenzelm@6116
|
6 |
*)
|
wenzelm@6116
|
7 |
|
wenzelm@6116
|
8 |
(*Interface of structure Symbol: Isabelle symbols represented as plain strings.*)
signature SYMBOL =
sig
  type symbol

  (*distinguished ASCII characters*)
  val SOH: symbol
  val STX: symbol
  val ENQ: symbol
  val ACK: symbol
  val DEL: symbol
  val space: symbol
  val spaces: int -> string

  (*basic classification*)
  val is_char: symbol -> bool
  val is_symbolic: symbol -> bool
  val is_printable: symbol -> bool
  val is_utf8_trailer: symbol -> bool

  (*input source control*)
  val eof: symbol
  val is_eof: symbol -> bool
  val not_eof: symbol -> bool
  val stopper: symbol Scan.stopper
  val sync: symbol
  val is_sync: symbol -> bool
  val malformed: symbol
  val end_malformed: symbol
  val separate_chars: string -> string
  val is_regular: symbol -> bool

  (*ASCII symbols*)
  val is_ascii: symbol -> bool
  val is_ascii_letter: symbol -> bool
  val is_ascii_digit: symbol -> bool
  val is_ascii_hex: symbol -> bool
  val is_ascii_quasi: symbol -> bool
  val is_ascii_blank: symbol -> bool
  val is_ascii_lower: symbol -> bool
  val is_ascii_upper: symbol -> bool
  val to_ascii_lower: symbol -> symbol
  val to_ascii_upper: symbol -> symbol

  (*raw symbols*)
  val is_raw: symbol -> bool
  val decode_raw: symbol -> string
  val encode_raw: string -> string

  (*symbol variants and kinds*)
  datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string
  val decode: symbol -> sym
  datatype kind = Letter | Digit | Quasi | Blank | Other
  val kind: symbol -> kind
  val is_letter: symbol -> bool
  val is_digit: symbol -> bool
  val is_quasi: symbol -> bool
  val is_blank: symbol -> bool
  val is_quasi_letter: symbol -> bool
  val is_letdig: symbol -> bool
  val is_ident: symbol list -> bool

  (*diagnostics and scanning*)
  val beginning: int -> symbol list -> string
  val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a
  val scan_id: string list -> string * string list
  val source: {do_recover: bool} -> (string, 'a) Source.source ->
    (symbol, (string, 'a) Source.source) Source.source
  val explode: string -> symbol list

  (*string operations on symbols*)
  val escape: string -> string
  val strip_blanks: string -> string
  val bump_init: string -> string
  val bump_string: string -> string
  val length: symbol list -> int
  val xsymbolsN: string
end;
|
wenzelm@6116
|
69 |
|
wenzelm@6116
|
70 |
structure Symbol: SYMBOL =
|
wenzelm@6116
|
71 |
struct
|
wenzelm@6116
|
72 |
|
wenzelm@14678
|
73 |
(** type symbol **)
|
wenzelm@6116
|
74 |
|
wenzelm@14678
|
75 |
(*Symbols, which are considered the smallest entities of any Isabelle
|
wenzelm@14678
|
76 |
string, may be of the following form:
|
wenzelm@6272
|
77 |
|
wenzelm@14834
|
78 |
(1) ASCII symbols: a
|
wenzelm@17823
|
79 |
(2) regular symbols: \<ident>
|
wenzelm@14834
|
80 |
(3) control symbols: \<^ident>
|
wenzelm@14834
|
81 |
(4) raw control symbols: \<^raw:...>, where "..." may be any printable
|
wenzelm@20205
|
82 |
character (excluding ".", ">"), or \<^raw000>
|
wenzelm@6272
|
83 |
|
wenzelm@14678
|
84 |
Output is subject to the print_mode variable (default: verbatim),
|
wenzelm@14678
|
85 |
actual interpretation in display is up to front-end tools.
|
wenzelm@6272
|
86 |
*)
|
wenzelm@6272
|
87 |
|
wenzelm@6272
|
88 |
(*symbols are represented as plain strings*)
type symbol = string;

(*distinguished ASCII control characters, written as string literals*)
val SOH = "\^A";   (*code 1*)
val STX = "\^B";   (*code 2*)
val ENQ = "\^E";   (*code 5*)
val ACK = "\^F";   (*code 6*)
val DEL = "\127";  (*code 127*)

(*the blank character, code 32*)
val space = " ";
|
wenzelm@17063
|
97 |
|
wenzelm@17063
|
98 |
local
  val block_size = 64;

  (*precomputed strings of 0..64 blanks; the last entry is one full block*)
  val blank_table =
    Vector.tabulate (block_size + 1, fn len => Library.replicate_string len space);

  fun blanks len = Vector.sub (blank_table, len);
in

(*string of k blanks: short ones come straight from the table, longer ones are
  assembled from full blocks plus a remainder*)
fun spaces k =
  if k >= block_size then
    Library.replicate_string (k div block_size) (blanks block_size) ^ blanks (k mod block_size)
  else blanks k;

end;
|
wenzelm@14678
|
106 |
|
wenzelm@14678
|
107 |
(*a symbol consisting of exactly one character*)
fun is_char s = (case size s of 1 => true | _ => false);
|
wenzelm@14678
|
108 |
|
wenzelm@14678
|
109 |
(*symbols of the form \<...>, excluding control symbols of the form \<^...>*)
fun is_symbolic s =
  if String.isPrefix "\\<^" s then false
  else String.isPrefix "\\<" s;
|
wenzelm@14678
|
111 |
|
wenzelm@14678
|
112 |
(*single characters are printable within the range from space to "~";
  any other symbol is printable unless it starts with \<^ *)
fun is_printable s =
  if not (is_char s) then not (String.isPrefix "\\<^" s)
  else
    let val code = ord s
    in ord space <= code andalso code <= ord "~" end;
|
wenzelm@14678
|
115 |
|
wenzelm@26632
|
116 |
(*single character with code in the range 128..191*)
fun is_utf8_trailer s =
  is_char s andalso
    let val code = ord s
    in code >= 128 andalso code < 192 end;
|
wenzelm@26632
|
117 |
|
wenzelm@14678
|
118 |
|
wenzelm@14678
|
119 |
(* input source control *)
|
wenzelm@14678
|
120 |
|
wenzelm@6272
|
121 |
(*end of input is represented by the empty string*)
val eof = "";
fun is_eof s = (s = eof);
fun not_eof s = not (is_eof s);

(*stopper for finite scans over symbols, built from eof / is_eof*)
val stopper = Scan.stopper (fn _ => eof) is_eof;
|
wenzelm@6272
|
125 |
|
wenzelm@14678
|
126 |
(*the distinguished sync symbol and its recognizer*)
val sync = "\\<^sync>";
fun is_sync s = (case s = sync of true => true | false => false);
|
wenzelm@14678
|
128 |
|
wenzelm@23676
|
129 |
(*pseudo symbols that bracket malformed input*)
val (malformed, end_malformed) = ("[[", "]]");

(*insert a space between the individual characters of a string*)
fun separate_chars str = space_implode space (explode str);

(*error message for a malformed symbol, shown with its characters spaced out*)
fun malformed_msg s =
  let val shown = quote (separate_chars s)
  in "Malformed symbolic character: " ^ shown end;
|
wenzelm@23676
|
134 |
|
wenzelm@23784
|
135 |
(*regular symbols: anything but eof, sync, and the malformed brackets*)
fun is_regular s =
  not (is_eof s orelse is_sync s orelse s = malformed orelse s = end_malformed);
|
wenzelm@14678
|
137 |
|
wenzelm@14678
|
138 |
|
wenzelm@14678
|
139 |
(* ascii symbols *)
|
wenzelm@14678
|
140 |
|
wenzelm@14678
|
141 |
(*single character with code below 128*)
fun is_ascii s = if is_char s then ord s < 128 else false;
|
wenzelm@14678
|
142 |
|
wenzelm@14678
|
143 |
(*single character within A..Z or a..z*)
fun is_ascii_letter s =
  is_char s andalso
    let
      val code = ord s;
      fun within (lo, hi) = ord lo <= code andalso code <= ord hi;
    in within ("A", "Z") orelse within ("a", "z") end;
|
wenzelm@14678
|
147 |
|
wenzelm@14678
|
148 |
(*single character within 0..9*)
fun is_ascii_digit s =
  is_char s andalso
    let val code = ord s
    in ord "0" <= code andalso code <= ord "9" end;
|
wenzelm@14678
|
150 |
|
wenzelm@24580
|
151 |
(*single character that is a hexadecimal digit: 0..9, A..F, or a..f*)
fun is_ascii_hex s =
  is_char s andalso
    let
      val code = ord s;
      fun within (lo, hi) = ord lo <= code andalso code <= ord hi;
    in List.exists within [("0", "9"), ("A", "F"), ("a", "f")] end;
|
wenzelm@24580
|
156 |
|
wenzelm@14678
|
157 |
(*quasi letters: underscore and prime*)
fun is_ascii_quasi s = (s = "_" orelse s = "'");
|
wenzelm@14678
|
160 |
|
wenzelm@14678
|
161 |
(*blank characters: space, tab, newline, ^K, ^L, ^M*)
fun is_ascii_blank s =
  List.exists (fn blank => blank = s) [" ", "\t", "\n", "\^K", "\^L", "\^M"];
|
wenzelm@14678
|
164 |
|
wenzelm@20200
|
165 |
(*single character within a..z*)
fun is_ascii_lower s =
  is_char s andalso
    let val code = ord s
    in ord "a" <= code andalso code <= ord "z" end;

(*single character within A..Z*)
fun is_ascii_upper s =
  is_char s andalso
    let val code = ord s
    in ord "A" <= code andalso code <= ord "Z" end;

(*map A..Z to a..z, leaving every other symbol unchanged*)
fun to_ascii_lower s =
  if not (is_ascii_upper s) then s
  else chr (ord s + (ord "a" - ord "A"));

(*map a..z to A..Z, leaving every other symbol unchanged*)
fun to_ascii_upper s =
  if not (is_ascii_lower s) then s
  else chr (ord s - (ord "a" - ord "A"));
|
wenzelm@20200
|
170 |
|
wenzelm@14678
|
171 |
|
wenzelm@14956
|
172 |
(* encode_raw *)
|
wenzelm@14956
|
173 |
|
wenzelm@20205
|
174 |
(*characters admitted literally inside \<^raw:...>: codes from 128 upwards, or
  the range from space to "~" except "." and ">"*)
fun raw_chr c =
  let val code = ord c in
    code >= 128 orelse
      (ord space <= code andalso code <= ord "~" andalso c <> "." andalso c <> ">")
  end;
|
wenzelm@14956
|
177 |
|
wenzelm@14956
|
178 |
(*encode a string as a sequence of raw symbols: maximal runs of admissible
  characters become \<^raw:...>, any other character becomes \<^rawNNN> with
  its decimal code*)
fun encode_raw str =
  let
    fun plain text = enclose "\\<^raw:" ">" text;
    fun coded c = enclose "\\<^raw" ">" (string_of_int (ord c));

    fun chunks cs =
      (case Library.take_prefix raw_chr cs of
        ([], []) => []
      | (run, []) => [plain (implode run)]
      | ([], d :: rest) => coded d :: chunks rest
      | (run, d :: rest) => plain (implode run) :: coded d :: chunks rest);
  in
    (*fast path: the whole string fits into a single \<^raw:...> symbol*)
    if exists_string (not o raw_chr) str then implode (chunks (explode str))
    else plain str
  end;
|
wenzelm@14956
|
193 |
|
wenzelm@14956
|
194 |
|
wenzelm@14956
|
195 |
(* diagnostics *)
|
wenzelm@14956
|
196 |
|
wenzelm@14956
|
197 |
(*the first n symbols of cs as text for messages: trailing blanks are dropped,
  remaining blanks are shown as spaces, and " ..." marks truncation*)
fun beginning n cs =
  let
    fun trim_right xs = #1 (Library.take_suffix is_ascii_blank xs);
    fun show c = if is_ascii_blank c then space else c;

    val trimmed = trim_right cs;
    val shown = trim_right (Library.take (n, trimmed));
    val text = implode (map show shown);
  in
    if length trimmed > n then text ^ " ..." else text
  end;
|
wenzelm@14956
|
207 |
|
wenzelm@14956
|
208 |
|
wenzelm@14956
|
209 |
(* decode_raw *)
|
wenzelm@14834
|
210 |
|
wenzelm@14834
|
211 |
(*raw symbols start with \<^raw and end with ">"*)
fun is_raw s =
  if String.isPrefix "\\<^raw" s then String.isSuffix ">" s
  else false;
|
wenzelm@14834
|
213 |
|
wenzelm@14834
|
214 |
(*content of a raw symbol: the literal text of \<^raw:...>, or the character
  whose decimal code is given in \<^rawNNN>; error for non-raw symbols*)
fun decode_raw s =
  let
    (*text between a prefix of the given length and the final ">"*)
    fun inner prefix_len = String.substring (s, prefix_len, size s - prefix_len - 1);
  in
    if not (is_raw s) then error (malformed_msg s)
    else if String.isPrefix "\\<^raw:" s then inner 7
    else chr (#1 (Library.read_int (explode (inner 6))))
  end;
|
wenzelm@14834
|
218 |
|
wenzelm@14834
|
219 |
|
wenzelm@14873
|
220 |
(* symbol variants *)
|
wenzelm@14873
|
221 |
|
wenzelm@14873
|
222 |
(*symbol variants: single character, regular symbol \<name>, control symbol
  \<^name>, or raw symbol with its decoded content*)
datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string;

(*Classify a symbol and extract its name or content.

  Control and regular symbols must be terminated by ">".  Unterminated input
  such as "\\<foo" or "\\<" is reported via malformed_msg; previously the last
  character was silently chopped off, or String.substring raised Subscript.*)
fun decode s =
  let
    val closed = String.isSuffix ">" s;
    (*text between a prefix of the given length and the final ">"*)
    fun name prefix_len = String.substring (s, prefix_len, size s - prefix_len - 1);
  in
    if is_char s then Char s
    else if is_raw s then Raw (decode_raw s)
    else if closed andalso String.isPrefix "\\<^" s then Ctrl (name 3)
    else if closed andalso String.isPrefix "\\<" s then Sym (name 2)
    else error (malformed_msg s)
  end;
|
wenzelm@14873
|
230 |
|
wenzelm@14873
|
231 |
|
wenzelm@14678
|
232 |
(* standard symbol kinds *)
|
wenzelm@14678
|
233 |
|
wenzelm@14678
|
234 |
(*standard kinds of symbols*)
datatype kind = Letter | Digit | Quasi | Blank | Other;

local
  (*table entries \<name> of kind Letter*)
  fun letters names = map (fn name => ("\\<" ^ name ^ ">", Letter)) names;

  val latin = explode "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
  val doubled = map (fn c => c ^ c) latin;

  (*Greek letter names, except lambda which is listed separately below*)
  val greek =
   ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa",
    "mu", "nu", "xi", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
    "Gamma", "Delta", "Theta", "Lambda", "Xi", "Pi", "Sigma", "Upsilon", "Phi", "Psi", "Omega"];

  val symbol_kinds = Symtab.make
   (letters latin @ letters doubled @ letters greek @
     [("\\<lambda>", Other), (*sic!*)
      ("\\<^isub>", Letter),
      ("\\<^isup>", Letter),
      ("\\<spacespace>", Blank)]);
in

(*kind of a symbol: single characters are classified by the ASCII tests, all
  other symbols via the table, with Other as default*)
fun kind s =
  if not (is_char s) then
    (case Symtab.lookup symbol_kinds s of
      SOME k => k
    | NONE => Other)
  else if is_ascii_letter s then Letter
  else if is_ascii_digit s then Digit
  else if is_ascii_quasi s then Quasi
  else if is_ascii_blank s then Blank
  else Other;

end;
|
skalberg@14171
|
388 |
|
wenzelm@14678
|
389 |
(*tests for the individual symbol kinds*)
fun is_letter s = (case kind s of Letter => true | _ => false);
fun is_digit s = (case kind s of Digit => true | _ => false);
fun is_quasi s = (case kind s of Quasi => true | _ => false);
fun is_blank s = (case kind s of Blank => true | _ => false);
|
skalberg@14171
|
393 |
|
wenzelm@14678
|
394 |
(*letter or quasi letter*)
fun is_quasi_letter s =
  (case kind s of
    Letter => true
  | Quasi => true
  | _ => false);

(*letter, digit, or quasi letter*)
fun is_letdig s =
  (case kind s of
    Letter => true
  | Digit => true
  | Quasi => true
  | _ => false);
|
skalberg@14171
|
396 |
|
wenzelm@16138
|
397 |
(*identifiers: a letter followed by letters, digits, or quasi letters*)
fun is_ident syms =
  (case syms of
    [] => false
  | first :: more => is_letter first andalso forall is_letdig more);
|
wenzelm@16138
|
399 |
|
skalberg@14171
|
400 |
|
skalberg@14171
|
401 |
|
wenzelm@14678
|
402 |
(** symbol input **)
|
skalberg@14171
|
403 |
|
wenzelm@14678
|
404 |
(* scanning through symbols *)
|
wenzelm@6272
|
405 |
|
wenzelm@6640
|
406 |
(*apply scan to the whole symbol list chs; any failure or leftover input is an
  error made up of msg and the beginning of the offending input*)
fun scanner msg scan chs =
  let
    fun message (cs, opt_msg) =
      let val context = ": " ^ quote (beginning 10 cs) in
        (case opt_msg of
          NONE => msg ^ context
        | SOME msg' => msg ^ ", " ^ msg' ^ context)
      end;

    val (result, rest) = Scan.error (Scan.finite stopper (!! message scan)) chs;
  in
    if null rest then result
    else error (message (rest, NONE))
  end;
|
wenzelm@6640
|
416 |
|
wenzelm@21858
|
417 |
(*scan an identifier: one letter followed by any number of letdig symbols,
  concatenated into a single string*)
fun scan_id cs =
  let
    val head = Scan.one is_letter;
    val tail = Scan.many is_letdig >> implode;
  in (head ^^ tail) cs end;
|
schirmer@14561
|
418 |
|
wenzelm@23676
|
419 |
|
wenzelm@23676
|
420 |
(* source *)
|
wenzelm@23676
|
421 |
|
wenzelm@14678
|
422 |
local

(*characters that form a symbol on their own, without further inspection*)
fun is_plain s = not (s = "\^M" orelse s = "\\" orelse is_eof s);

fun to_newline _ = "\n";

(*the various encodings of newline, all yielding "\n"*)
val scan_encoded_newline =
  ($$ "\^M" -- $$ "\n" >> to_newline) ||
  ($$ "\^M" >> to_newline) ||
  ($$ "\\" -- Scan.optional ($$ "\\") "" -- Scan.this_string "<^newline>" >> to_newline);

(*body of a raw symbol: "raw:" with literal text, or "raw" with decimal digits*)
val scan_raw =
  (Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode)) ||
  (Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode));

(*backslash, with an optional second backslash that is dropped*)
val scan_backslash = $$ "\\" --| Scan.optional ($$ "\\") "";

(*what follows "\<": control or regular symbol name with terminating ">"*)
val scan_symbol_rest = ($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">";

fun malformed_input (cs, _) = malformed_msg (beginning 10 ("\\" :: "<" :: cs));

(*scan a single symbol*)
val scan =
  Scan.one is_plain ||
  scan_encoded_newline ||
  (scan_backslash ^^ $$ "<" ^^ !! malformed_input scan_symbol_rest) ||
  Scan.one not_eof;

(*input at which recovery from malformed symbols stops*)
val scan_resync =
  Scan.one is_ascii_blank ||
  $$ "\"" || $$ "`" || $$ "\\" ||
  Scan.this_string "(*" || Scan.this_string "*)" ||
  Scan.this_string "{*" || Scan.this_string "*}";

(*recovery: enclose the malformed stretch in malformed / end_malformed*)
val recover =
  ((Scan.this (explode "\\\\<") || Scan.this (explode "\\<")) @@@
    Scan.repeat (Scan.unless scan_resync (Scan.one not_eof)))
  >> (fn ss => [malformed] @ ss @ [end_malformed]);

in

(*source of symbols on top of a source of characters, optionally with recovery
  from malformed symbols*)
fun source {do_recover} src =
  let val recovery = if do_recover then SOME (false, fn _ => recover) else NONE
  in Source.source stopper (Scan.bulk scan) recovery src end;

end;
|
wenzelm@14678
|
460 |
|
wenzelm@14562
|
461 |
|
wenzelm@23676
|
462 |
(* explode *)
|
wenzelm@6116
|
463 |
|
wenzelm@23676
|
464 |
local

(*does the character list contain "\<" or ^M, which need proper scanning?*)
fun needs_scan ("\\" :: "<" :: _) = true
  | needs_scan ("\^M" :: _) = true
  | needs_scan (_ :: cs) = needs_scan cs
  | needs_scan [] = false;

in

(*split a string into symbols; plain text is simply split into characters*)
fun sym_explode str =
  let val chs = explode str in
    if needs_scan chs then
      Source.exhaust (source {do_recover = false} (Source.of_list chs))
    else chs
  end;

end;
|
wenzelm@14994
|
480 |
|
wenzelm@6116
|
481 |
|
wenzelm@14977
|
482 |
(* escape *)
|
wenzelm@14977
|
483 |
|
wenzelm@14977
|
484 |
(*prefix every non-character symbol with an extra backslash*)
fun escape str =
  let fun esc s = if is_char s then s else "\\" ^ s
  in implode (map esc (sym_explode str)) end;
|
wenzelm@14977
|
485 |
|
wenzelm@14977
|
486 |
|
wenzelm@14678
|
487 |
(* blanks *)
|
wenzelm@14678
|
488 |
|
wenzelm@14678
|
489 |
(*remove leading and trailing blank symbols*)
fun strip_blanks s =
  let
    val (_, without_leading) = Library.take_prefix is_blank (sym_explode s);
    val (core, _) = Library.take_suffix is_blank without_leading;
  in implode core end;
|
wenzelm@14678
|
494 |
|
wenzelm@14678
|
495 |
|
wenzelm@14678
|
496 |
(* bump string -- treat as base 26 or base 1 numbers *)
|
wenzelm@14678
|
497 |
|
wenzelm@15979
|
498 |
(*test on a reversed symbol list: the second element is \<^isub> or \<^isup>,
  or otherwise the first element is symbolic*)
fun symbolic_end rev_syms =
  (case rev_syms of
    _ :: "\\<^isub>" :: _ => true
  | _ :: "\\<^isup>" :: _ => true
  | s :: _ => is_symbolic s
  | [] => false);
|
wenzelm@14678
|
502 |
|
wenzelm@14678
|
503 |
(*initial bump: append a prime after a symbolic end, otherwise append "a"*)
fun bump_init str =
  str ^ (if symbolic_end (rev (sym_explode str)) then "'" else "a");
|
wenzelm@12904
|
506 |
|
wenzelm@12904
|
507 |
(*increment the part of str before its trailing quasi letters: a symbolic end
  gets a prime appended, otherwise the a..z part is bumped with carry*)
fun bump_string str =
  let
    fun below_z s = is_char s andalso ord "a" <= ord s andalso ord s < ord "z";

    (*works on the reversed symbol list, i.e. last symbol first*)
    fun bump [] = ["a"]
      | bump (s :: ss) =
          if s = "z" then "a" :: bump ss
          else if below_z s then chr (ord s + 1) :: ss
          else "a" :: s :: ss;

    val (body, quasi_suffix) = Library.take_suffix is_quasi (sym_explode str);
    val rev_body = rev body;
    val rev_body' = if symbolic_end rev_body then "'" :: rev_body else bump rev_body;
  in implode (rev rev_body' @ quasi_suffix) end;
|
wenzelm@14678
|
519 |
|
wenzelm@12904
|
520 |
|
wenzelm@6272
|
521 |
|
wenzelm@23618
|
522 |
(** xsymbols **)
|
wenzelm@14977
|
523 |
|
wenzelm@14977
|
524 |
(*name of the xsymbols mode*)
val xsymbolsN = "xsymbols";

(*nominal width of a symbol: 0 if not printable, 2 for \<long...>, \<Long...>
  and \<spacespace>, 1 otherwise*)
fun sym_len s =
  if is_printable s then
    if String.isPrefix "\\<long" s orelse String.isPrefix "\\<Long" s orelse
      s = "\\<spacespace>"
    then 2 else 1
  else (0: int);

(*sum of the nominal widths of all symbols*)
fun sym_length ss = List.foldl (fn (s, n) => sym_len s + n) 0 ss;
|
wenzelm@14678
|
534 |
|
wenzelm@6116
|
535 |
(*Final declarations of this structure!  From here on the basic explode and
  length are shadowed by their symbol-aware versions, so these bindings stay last.*)
val explode = sym_explode;
val length = sym_length;
|
wenzelm@6116
|
538 |
|
wenzelm@6116
|
539 |
end;
|