1 /* Title: Pure/General/symbol.scala
4 Detecting and recoding Isabelle symbols.
10 import scala.collection.{jcl, mutable}
11 import scala.util.matching.Regex
18 private val plain = new Regex("""(?xs)
19 [^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)
21 private val symbol = new Regex("""(?xs)
23 \^? [A-Za-z][A-Za-z0-9_']* |
24 \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
26 private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
27 """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
30 val regex = new Regex(plain + "|" + symbol + "|" + bad_symbol + "| .")
32 // prefix of another symbol
33 def is_open(s: CharSequence): Boolean =
36 len == 1 && Character.isHighSurrogate(s.charAt(0)) ||
39 len > 2 && s.charAt(len - 1) != '>' // FIXME bad_symbol !??
45 def could_open(c: Char): Boolean =
46 c == '\\' || Character.isHighSurrogate(c)
48 def elements(text: CharSequence) = new Iterator[String]
50 private val matcher = regex.pattern.matcher(text)
52 def hasNext = i < text.length
55 if (could_open(text.charAt(i))) {
56 matcher.region(i, text.length).lookingAt
60 val s = text.subSequence(i, i + len)
67 /* decoding offsets */
69 class Index(text: CharSequence)
71 case class Entry(chr: Int, sym: Int)
72 val index: Array[Entry] =
74 val matcher = regex.pattern.matcher(text)
75 val buf = new mutable.ArrayBuffer[Entry]
78 while (chr < text.length) {
80 if (could_open(text.charAt(chr))) {
81 matcher.region(chr, text.length).lookingAt
87 if (len > 1) buf += Entry(chr, sym)
91 def decode(sym: Int): Int =
93 val end = index.length
94 def bisect(a: Int, b: Int): Int =
98 if (sym < index(c).sym) bisect(a, c)
99 else if (c + 1 == end || sym < index(c + 1).sym) c
100 else bisect(c + 1, b)
104 val i = bisect(0, end)
106 else index(i).chr + sym - index(i).sym
113 private class Recoder(list: List[(String, String)])
115 private val (min, max) =
119 for ((x, _) <- list) {
128 val table = new jcl.HashMap[String, String] // reasonably efficient?
129 for ((x, y) <- list) table + (x -> y)
132 def recode(text: String): String =
134 val len = text.length
135 val matcher = regex.pattern.matcher(text)
136 val result = new StringBuilder(len)
140 if (min <= c && c <= max) {
141 matcher.region(i, len).lookingAt
142 val x = matcher.group
143 result.append(table.get(x) getOrElse x)
146 else { result.append(c); i += 1 }
154 /** Symbol interpretation **/
156 class Interpretation(symbol_decls: Iterator[String])
160 private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
161 private val key = new Regex("""(?xs) (.+): """)
163 private def read_decl(decl: String): (String, Map[String, String]) =
165 def err() = error("Bad symbol declaration: " + decl)
167 def read_props(props: List[String]): Map[String, String] =
171 case _ :: Nil => err()
172 case key(x) :: y :: rest => read_props(rest) + (x -> y)
176 decl.split("\\s+").toList match {
178 case sym :: props => (sym, read_props(props))
182 private val symbols: List[(String, Map[String, String])] =
183 for (decl <- symbol_decls.toList if !empty.pattern.matcher(decl).matches)
184 yield read_decl(decl)
187 /* misc properties */
189 val names: Map[String, String] =
191 val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
192 Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
195 val abbrevs: Map[String, String] = Map((
196 for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
197 yield (sym -> props("abbrev"))): _*)
200 /* main recoder methods */
202 private val (decoder, encoder) =
206 (sym, props) <- symbols
208 try { Integer.decode(props("code")).intValue }
210 case _: NoSuchElementException => error("Missing code for symbol " + sym)
211 case _: NumberFormatException => error("Bad code for symbol " + sym)
213 val ch = new String(Character.toChars(code))
215 (new Recoder(mapping),
216 new Recoder(mapping map { case (x, y) => (y, x) }))
219 def decode(text: String): String = decoder.recode(text)
220 def encode(text: String): String = encoder.recode(text)
225 private val raw_letters = Set(
226 "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
227 "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
228 "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
229 "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
231 "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
232 "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
233 "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
234 "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
235 "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
236 "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
237 "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
238 "\\<x>", "\\<y>", "\\<z>",
240 "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
241 "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
242 "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
243 "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
244 "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
245 "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
246 "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
247 "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
248 "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
250 "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
251 "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
252 "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
253 "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
254 "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
255 "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
256 "\\<Psi>", "\\<Omega>",
258 "\\<^isub>", "\\<^isup>")
260 private val letters = raw_letters ++ raw_letters.map(decode(_))
262 def is_letter(sym: String): Boolean = letters.contains(sym)
264 def is_digit(sym: String): Boolean =
265 if (sym.length == 1) {
271 def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
274 private val raw_blanks =
275 Set(" ", "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
276 private val blanks = raw_blanks ++ raw_blanks.map(decode(_))
278 def is_blank(sym: String): Boolean = blanks.contains(sym)