simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
allow malformed symbols inside quoted material, comments etc. -- for improved user experience with incremental re-parsing;
refined treatment of malformed surrogates (Scala);
1 /* Title: Pure/General/symbol.scala
4 Detecting and recoding Isabelle symbols.
10 import scala.collection.mutable
11 import scala.util.matching.Regex
21 private val static_spaces = space * 4000
23 def spaces(k: Int): String =
26 if (k < static_spaces.length) static_spaces.substring(0, k)
33 private val plain = new Regex("""(?xs)
34 [^\r\\\ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)
36 private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
38 private val symbol = new Regex("""(?xs)
40 \^? [A-Za-z][A-Za-z0-9_']* |
41 \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
43 private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
44 """ [\ud800-\udbff] | \\<^? """)
47 new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
52 def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
54 def is_physical_newline(s: CharSequence): Boolean =
55 "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
57 def is_malformed(s: CharSequence): Boolean =
58 !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches
60 class Matcher(text: CharSequence)
62 private val matcher = regex_total.pattern.matcher(text)
63 def apply(start: Int, end: Int): Int =
65 require(0 <= start && start < end && end <= text.length)
66 if (is_plain(text.charAt(start))) 1
68 matcher.region(start, end).lookingAt
77 def iterator(text: CharSequence) = new Iterator[CharSequence]
79 private val matcher = new Matcher(text)
81 def hasNext = i < text.length
84 val n = matcher(i, text.length)
85 val s = text.subSequence(i, i + n)
92 /* decoding offsets */
94 class Index(text: CharSequence)
96 case class Entry(chr: Int, sym: Int)
97 val index: Array[Entry] =
99 val matcher = new Matcher(text)
100 val buf = new mutable.ArrayBuffer[Entry]
103 while (chr < text.length) {
104 val n = matcher(chr, text.length)
107 if (n > 1) buf += Entry(chr, sym)
111 def decode(sym1: Int): Int =
114 val end = index.length
115 def bisect(a: Int, b: Int): Int =
119 if (sym < index(c).sym) bisect(a, c)
120 else if (c + 1 == end || sym < index(c + 1).sym) c
121 else bisect(c + 1, b)
125 val i = bisect(0, end)
127 else index(i).chr + sym - index(i).sym
129 def decode(range: Text.Range): Text.Range = range.map(decode(_))
135 private class Recoder(list: List[(String, String)])
137 private val (min, max) =
141 for ((x, _) <- list) {
150 var tab = Map[String, String]()
151 for ((x, y) <- list) {
153 case None => tab += (x -> y)
155 error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
160 def recode(text: String): String =
162 val len = text.length
163 val matcher = regex_total.pattern.matcher(text)
164 val result = new StringBuilder(len)
168 if (min <= c && c <= max) {
169 matcher.region(i, len).lookingAt
170 val x = matcher.group
171 result.append(table.get(x) getOrElse x)
174 else { result.append(c); i += 1 }
182 /** Symbol interpretation **/
184 class Interpretation(symbol_decls: List[String])
188 private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
189 private val key = new Regex("""(?xs) (.+): """)
191 private def read_decl(decl: String): (String, Map[String, String]) =
193 def err() = error("Bad symbol declaration: " + decl)
195 def read_props(props: List[String]): Map[String, String] =
199 case _ :: Nil => err()
200 case key(x) :: y :: rest => read_props(rest) + (x -> y)
204 decl.split("\\s+").toList match {
205 case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
210 private val symbols: List[(String, Map[String, String])] =
212 for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches)
213 yield read_decl(decl)): _*) toList
216 /* misc properties */
218 val names: Map[String, String] =
220 val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
221 Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
224 val abbrevs: Map[String, String] =
226 for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
227 yield (sym -> props("abbrev"))): _*)
230 /* main recoder methods */
232 private val (decoder, encoder) =
236 (sym, props) <- symbols
238 try { Integer.decode(props("code")).intValue }
240 case _: NoSuchElementException => error("Missing code for symbol " + sym)
241 case _: NumberFormatException => error("Bad code for symbol " + sym)
243 val ch = new String(Character.toChars(code))
245 if (code < 128) error("Illegal ASCII code for symbol " + sym)
248 (new Recoder(mapping),
249 new Recoder(mapping map { case (x, y) => (y, x) }))
252 def decode(text: String): String = decoder.recode(text)
253 def encode(text: String): String = encoder.recode(text)
258 private object Decode_Set
260 def apply(elems: String*): Set[String] =
262 val content = elems.toList
263 Set((content ::: content.map(decode)): _*)
267 private val letters = Decode_Set(
268 "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
269 "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
270 "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
271 "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
273 "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
274 "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
275 "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
276 "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
277 "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
278 "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
279 "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
280 "\\<x>", "\\<y>", "\\<z>",
282 "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
283 "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
284 "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
285 "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
286 "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
287 "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
288 "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
289 "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
290 "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
292 "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
293 "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
294 "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
295 "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
296 "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
297 "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
298 "\\<Psi>", "\\<Omega>",
300 "\\<^isub>", "\\<^isup>")
303 Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
305 private val sym_chars =
306 Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
308 def is_letter(sym: String): Boolean = letters.contains(sym)
309 def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
310 def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
311 def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
312 def is_blank(sym: String): Boolean = blanks.contains(sym)
313 def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
314 def is_symbolic(sym: String): Boolean =
315 sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")