1 /* Title: Pure/General/symbol.scala
4 Detecting and recoding Isabelle symbols.
10 import scala.collection.mutable
11 import scala.util.matching.Regex
21 private val static_spaces = space * 4000
23 def spaces(k: Int): String =
26 if (k < static_spaces.length) static_spaces.substring(0, k)
33 private val plain = new Regex("""(?xs)
34 [^\r\\\ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)
36 private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
38 private val symbol = new Regex("""(?xs)
40 \^? [A-Za-z][A-Za-z0-9_']* |
41 \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
43 // FIXME cover bad surrogates!?
44 // FIXME check wrt. Isabelle/ML version
45 private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
46 """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
49 val regex = new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + bad_symbol + "| .")
54 def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
56 def is_physical_newline(s: CharSequence): Boolean =
57 "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
59 def is_wellformed(s: CharSequence): Boolean =
60 s.length == 1 && is_plain(s.charAt(0)) || !bad_symbol.pattern.matcher(s).matches
62 class Matcher(text: CharSequence)
64 private val matcher = regex.pattern.matcher(text)
65 def apply(start: Int, end: Int): Int =
67 require(0 <= start && start < end && end <= text.length)
68 if (is_plain(text.charAt(start))) 1
70 matcher.region(start, end).lookingAt
79 def iterator(text: CharSequence) = new Iterator[CharSequence]
81 private val matcher = new Matcher(text)
83 def hasNext = i < text.length
86 val n = matcher(i, text.length)
87 val s = text.subSequence(i, i + n)
94 /* decoding offsets */
96 class Index(text: CharSequence)
98 case class Entry(chr: Int, sym: Int)
99 val index: Array[Entry] =
101 val matcher = new Matcher(text)
102 val buf = new mutable.ArrayBuffer[Entry]
105 while (chr < text.length) {
106 val n = matcher(chr, text.length)
109 if (n > 1) buf += Entry(chr, sym)
113 def decode(sym1: Int): Int =
116 val end = index.length
117 def bisect(a: Int, b: Int): Int =
121 if (sym < index(c).sym) bisect(a, c)
122 else if (c + 1 == end || sym < index(c + 1).sym) c
123 else bisect(c + 1, b)
127 val i = bisect(0, end)
129 else index(i).chr + sym - index(i).sym
131 def decode(range: Text.Range): Text.Range = range.map(decode(_))
137 private class Recoder(list: List[(String, String)])
139 private val (min, max) =
143 for ((x, _) <- list) {
152 var tab = Map[String, String]()
153 for ((x, y) <- list) {
155 case None => tab += (x -> y)
157 error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
162 def recode(text: String): String =
164 val len = text.length
165 val matcher = regex.pattern.matcher(text)
166 val result = new StringBuilder(len)
170 if (min <= c && c <= max) {
171 matcher.region(i, len).lookingAt
172 val x = matcher.group
173 result.append(table.get(x) getOrElse x)
176 else { result.append(c); i += 1 }
184 /** Symbol interpretation **/
186 class Interpretation(symbol_decls: List[String])
190 private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
191 private val key = new Regex("""(?xs) (.+): """)
193 private def read_decl(decl: String): (String, Map[String, String]) =
195 def err() = error("Bad symbol declaration: " + decl)
197 def read_props(props: List[String]): Map[String, String] =
201 case _ :: Nil => err()
202 case key(x) :: y :: rest => read_props(rest) + (x -> y)
206 decl.split("\\s+").toList match {
207 case sym :: props if sym.length > 1 && is_wellformed(sym) => (sym, read_props(props))
212 private val symbols: List[(String, Map[String, String])] =
214 for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches)
215 yield read_decl(decl)): _*) toList
218 /* misc properties */
220 val names: Map[String, String] =
222 val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
223 Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
226 val abbrevs: Map[String, String] =
228 for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
229 yield (sym -> props("abbrev"))): _*)
232 /* main recoder methods */
234 private val (decoder, encoder) =
238 (sym, props) <- symbols
240 try { Integer.decode(props("code")).intValue }
242 case _: NoSuchElementException => error("Missing code for symbol " + sym)
243 case _: NumberFormatException => error("Bad code for symbol " + sym)
245 val ch = new String(Character.toChars(code))
247 if (code < 128) error("Illegal ASCII code for symbol " + sym)
250 (new Recoder(mapping),
251 new Recoder(mapping map { case (x, y) => (y, x) }))
254 def decode(text: String): String = decoder.recode(text)
255 def encode(text: String): String = encoder.recode(text)
260 private object Decode_Set
262 def apply(elems: String*): Set[String] =
264 val content = elems.toList
265 Set((content ::: content.map(decode)): _*)
269 private val letters = Decode_Set(
270 "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
271 "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
272 "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
273 "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
275 "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
276 "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
277 "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
278 "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
279 "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
280 "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
281 "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
282 "\\<x>", "\\<y>", "\\<z>",
284 "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
285 "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
286 "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
287 "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
288 "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
289 "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
290 "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
291 "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
292 "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
294 "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
295 "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
296 "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
297 "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
298 "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
299 "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
300 "\\<Psi>", "\\<Omega>",
302 "\\<^isub>", "\\<^isup>")
305 Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
307 private val sym_chars =
308 Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
310 def is_letter(sym: String): Boolean = letters.contains(sym)
311 def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
312 def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
313 def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
314 def is_blank(sym: String): Boolean = blanks.contains(sym)
315 def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
316 def is_symbolic(sym: String): Boolean = sym.startsWith("\\<") && !sym.startsWith("\\<^")