1 /* Title: Pure/General/symbol.scala
4 Detecting and recoding Isabelle symbols.
10 import scala.collection.mutable
11 import scala.util.matching.Regex
21 private val static_spaces = space * 4000
23 def spaces(k: Int): String =
26 if (k < static_spaces.length) static_spaces.substring(0, k)
31 /* ASCII characters */
33 def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
34 def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
35 def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
37 def is_ascii_letdig(c: Char): Boolean =
38 is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
40 def is_ascii_identifier(s: String): Boolean =
41 s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig)
46 private val plain = new Regex("""(?xs)
47 [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """)
49 private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
51 private val symbol = new Regex("""(?xs)
53 \^? [A-Za-z][A-Za-z0-9_']* |
54 \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
56 private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
57 """ [\ud800-\udbff\ufffd] | \\<\^? """)
60 new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
65 def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
67 def is_physical_newline(s: CharSequence): Boolean =
68 "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
70 def is_malformed(s: CharSequence): Boolean =
71 !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches
73 class Matcher(text: CharSequence)
75 private val matcher = regex_total.pattern.matcher(text)
76 def apply(start: Int, end: Int): Int =
78 require(0 <= start && start < end && end <= text.length)
79 if (is_plain(text.charAt(start))) 1
81 matcher.region(start, end).lookingAt
88 /* efficient iterators */
90 def iterator(text: CharSequence): Iterator[CharSequence] =
91 new Iterator[CharSequence]
93 private val matcher = new Matcher(text)
95 def hasNext = i < text.length
98 val n = matcher(i, text.length)
99 val s = text.subSequence(i, i + n)
105 private val char_symbols: Array[String] =
106 (0 to 127).iterator.map(i => new String(Array(i.toChar))).toArray
108 private def make_string(sym: CharSequence): String =
112 val c = sym.charAt(0)
113 if (c < char_symbols.length) char_symbols(c)
115 case _ => sym.toString
118 def iterator_string(text: CharSequence): Iterator[String] =
119 iterator(text).map(make_string)
122 /* decoding offsets */
124 class Index(text: CharSequence)
126 case class Entry(chr: Int, sym: Int)
127 val index: Array[Entry] =
129 val matcher = new Matcher(text)
130 val buf = new mutable.ArrayBuffer[Entry]
133 while (chr < text.length) {
134 val n = matcher(chr, text.length)
137 if (n > 1) buf += Entry(chr, sym)
141 def decode(sym1: Int): Int =
144 val end = index.length
145 def bisect(a: Int, b: Int): Int =
149 if (sym < index(c).sym) bisect(a, c)
150 else if (c + 1 == end || sym < index(c + 1).sym) c
151 else bisect(c + 1, b)
155 val i = bisect(0, end)
157 else index(i).chr + sym - index(i).sym
159 def decode(range: Text.Range): Text.Range = range.map(decode(_))
165 private class Recoder(list: List[(String, String)])
167 private val (min, max) =
171 for ((x, _) <- list) {
180 var tab = Map[String, String]()
181 for ((x, y) <- list) {
183 case None => tab += (x -> y)
185 error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
190 def recode(text: String): String =
192 val len = text.length
193 val matcher = regex_total.pattern.matcher(text)
194 val result = new StringBuilder(len)
198 if (min <= c && c <= max) {
199 matcher.region(i, len).lookingAt
200 val x = matcher.group
201 result.append(table.get(x) getOrElse x)
204 else { result.append(c); i += 1 }
212 /** Symbol interpretation **/
214 class Interpretation(symbol_decls: List[String])
218 private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
219 private val key = new Regex("""(?xs) (.+): """)
221 private def read_decl(decl: String): (String, Map[String, String]) =
223 def err() = error("Bad symbol declaration: " + decl)
225 def read_props(props: List[String]): Map[String, String] =
229 case _ :: Nil => err()
230 case key(x) :: y :: rest => read_props(rest) + (x -> y)
234 decl.split("\\s+").toList match {
235 case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
240 private val symbols: List[(String, Map[String, String])] =
242 for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches)
243 yield read_decl(decl)): _*) toList
246 /* misc properties */
248 val names: Map[String, String] =
250 val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
251 Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
254 val abbrevs: Map[String, String] =
256 for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
257 yield (sym -> props("abbrev"))): _*)
262 val fonts: Map[String, String] =
264 for ((sym, props) <- symbols if props.isDefinedAt("font"))
265 yield (sym -> props("font"))): _*)
267 val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
268 val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
270 def lookup_font(sym: String): Option[Int] = fonts.get(sym).map(font_index(_))
273 /* main recoder methods */
275 private val (decoder, encoder) =
279 (sym, props) <- symbols
281 try { Integer.decode(props("code")).intValue }
283 case _: NoSuchElementException => error("Missing code for symbol " + sym)
284 case _: NumberFormatException => error("Bad code for symbol " + sym)
286 val ch = new String(Character.toChars(code))
288 if (code < 128) error("Illegal ASCII code for symbol " + sym)
291 (new Recoder(mapping),
292 new Recoder(mapping map { case (x, y) => (y, x) }))
295 def decode(text: String): String = decoder.recode(text)
296 def encode(text: String): String = encoder.recode(text)
301 private object Decode_Set
303 def apply(elems: String*): Set[String] =
305 val content = elems.toList
306 Set((content ::: content.map(decode)): _*)
310 private val letters = Decode_Set(
311 "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
312 "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
313 "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
314 "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
316 "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
317 "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
318 "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
319 "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
320 "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
321 "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
322 "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
323 "\\<x>", "\\<y>", "\\<z>",
325 "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
326 "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
327 "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
328 "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
329 "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
330 "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
331 "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
332 "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
333 "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
335 "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
336 "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
337 "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
338 "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
339 "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
340 "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
341 "\\<Psi>", "\\<Omega>",
343 "\\<^isub>", "\\<^isup>")
346 Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
348 private val sym_chars =
349 Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
351 def is_letter(sym: String): Boolean = letters.contains(sym)
352 def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
353 def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
354 def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
355 def is_blank(sym: String): Boolean = blanks.contains(sym)
356 def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
357 def is_symbolic(sym: String): Boolean =
358 sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
361 /* control symbols */
363 private val ctrl_decoded: Set[String] =
364 Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
366 def is_ctrl(sym: String): Boolean =
367 sym.startsWith("\\<^") || ctrl_decoded.contains(sym)
369 def is_controllable(sym: String): Boolean =
370 !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym)
372 private val subscript_decoded = Set(decode("\\<^sub>"), decode("\\<^isub>"))
373 private val superscript_decoded = Set(decode("\\<^sup>"), decode("\\<^isup>"))
374 val bold_decoded = decode("\\<^bold>")
376 def is_subscript_decoded(sym: String): Boolean = subscript_decoded.contains(sym)
377 def is_superscript_decoded(sym: String): Boolean = superscript_decoded.contains(sym)
378 def is_bold_decoded(sym: String): Boolean = sym == bold_decoded