src/Pure/General/symbol.scala
author wenzelm
Sat, 19 Dec 2009 11:45:14 +0100
changeset 34143 d8d9df8407f6
parent 34104 2b9cdf23c188
child 34146 6cc9a0cbaf55
permissions -rw-r--r--
added symbol classification;
tuned;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.{jcl, mutable}
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   /* Symbol regexps */
    17 
    18   private val plain = new Regex("""(?xs)
    19     [^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)
    20 
    21   private val symbol = new Regex("""(?xs)
    22       \\ < (?:
    23       \^? [A-Za-z][A-Za-z0-9_']* |
    24       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    25 
    26   private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    27     """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
    28 
    29   // total pattern
    30   val regex = new Regex(plain + "|" + symbol + "|" + bad_symbol + "| .")
    31 
    32   // prefix of another symbol
    33   def is_open(s: CharSequence): Boolean =
    34   {
    35     val len = s.length
    36     len == 1 && Character.isHighSurrogate(s.charAt(0)) ||
    37     s == "\\" ||
    38     s == "\\<" ||
    39     len > 2 && s.charAt(len - 1) != '>'   // FIXME bad_symbol !??
    40   }
    41 
    42 
    43   /* elements */
    44 
    45   def could_open(c: Char): Boolean =
    46     c == '\\' || Character.isHighSurrogate(c)
    47 
    48   def elements(text: CharSequence) = new Iterator[String]
    49   {
    50     private val matcher = regex.pattern.matcher(text)
    51     private var i = 0
    52     def hasNext = i < text.length
    53     def next = {
    54       val len =
    55         if (could_open(text.charAt(i))) {
    56           matcher.region(i, text.length).lookingAt
    57           matcher.group.length
    58         }
    59         else 1
    60       val s = text.subSequence(i, i + len)
    61       i += len
    62       s.toString
    63     }
    64   }
    65 
    66 
    67   /* decoding offsets */
    68 
    69   class Index(text: CharSequence)
    70   {
    71     case class Entry(chr: Int, sym: Int)
    72     val index: Array[Entry] =
    73     {
    74       val matcher = regex.pattern.matcher(text)
    75       val buf = new mutable.ArrayBuffer[Entry]
    76       var chr = 0
    77       var sym = 0
    78       while (chr < text.length) {
    79         val len =
    80           if (could_open(text.charAt(chr))) {
    81             matcher.region(chr, text.length).lookingAt
    82             matcher.group.length
    83           }
    84           else 1
    85         chr += len
    86         sym += 1
    87         if (len > 1) buf += Entry(chr, sym)
    88       }
    89       buf.toArray
    90     }
    91     def decode(sym: Int): Int =
    92     {
    93       val end = index.length
    94       def bisect(a: Int, b: Int): Int =
    95       {
    96         if (a < b) {
    97           val c = (a + b) / 2
    98           if (sym < index(c).sym) bisect(a, c)
    99           else if (c + 1 == end || sym < index(c + 1).sym) c
   100           else bisect(c + 1, b)
   101         }
   102         else -1
   103       }
   104       val i = bisect(0, end)
   105       if (i < 0) sym
   106       else index(i).chr + sym - index(i).sym
   107     }
   108   }
   109 
   110 
   111   /* recoding text */
   112 
   113   private class Recoder(list: List[(String, String)])
   114   {
   115     private val (min, max) =
   116     {
   117       var min = '\uffff'
   118       var max = '\u0000'
   119       for ((x, _) <- list) {
   120         val c = x(0)
   121         if (c < min) min = c
   122         if (c > max) max = c
   123       }
   124       (min, max)
   125     }
   126     private val table =
   127     {
   128       val table = new jcl.HashMap[String, String]   // reasonably efficient?
   129       for ((x, y) <- list) table + (x -> y)
   130       table
   131     }
   132     def recode(text: String): String =
   133     {
   134       val len = text.length
   135       val matcher = regex.pattern.matcher(text)
   136       val result = new StringBuilder(len)
   137       var i = 0
   138       while (i < len) {
   139         val c = text(i)
   140         if (min <= c && c <= max) {
   141           matcher.region(i, len).lookingAt
   142           val x = matcher.group
   143           result.append(table.get(x) getOrElse x)
   144           i = matcher.end
   145         }
   146         else { result.append(c); i += 1 }
   147       }
   148       result.toString
   149     }
   150   }
   151 
   152 
   153 
   154   /** Symbol interpretation **/
   155 
   156   class Interpretation(symbol_decls: Iterator[String])
   157   {
   158     /* read symbols */
   159 
   160     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   161     private val key = new Regex("""(?xs) (.+): """)
   162 
   163     private def read_decl(decl: String): (String, Map[String, String]) =
   164     {
   165       def err() = error("Bad symbol declaration: " + decl)
   166 
   167       def read_props(props: List[String]): Map[String, String] =
   168       {
   169         props match {
   170           case Nil => Map()
   171           case _ :: Nil => err()
   172           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   173           case _ => err()
   174         }
   175       }
   176       decl.split("\\s+").toList match {
   177         case Nil => err()
   178         case sym :: props => (sym, read_props(props))
   179       }
   180     }
   181 
   182     private val symbols: List[(String, Map[String, String])] =
   183       for (decl <- symbol_decls.toList if !empty.pattern.matcher(decl).matches)
   184         yield read_decl(decl)
   185 
   186 
   187     /* misc properties */
   188 
   189     val names: Map[String, String] =
   190     {
   191       val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
   192       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   193     }
   194 
   195     val abbrevs: Map[String, String] = Map((
   196       for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   197         yield (sym -> props("abbrev"))): _*)
   198 
   199 
   200     /* main recoder methods */
   201 
   202     private val (decoder, encoder) =
   203     {
   204       val mapping =
   205         for {
   206           (sym, props) <- symbols
   207           val code =
   208             try { Integer.decode(props("code")).intValue }
   209             catch {
   210               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   211               case _: NumberFormatException => error("Bad code for symbol " + sym)
   212             }
   213           val ch = new String(Character.toChars(code))
   214         } yield (sym, ch)
   215       (new Recoder(mapping),
   216        new Recoder(mapping map { case (x, y) => (y, x) }))
   217     }
   218 
   219     def decode(text: String): String = decoder.recode(text)
   220     def encode(text: String): String = encoder.recode(text)
   221 
   222 
   223     /* classification */
   224 
   225     private val raw_letters = Set(
   226       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   227       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   228       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   229       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   230 
   231       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   232       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   233       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   234       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   235       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   236       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   237       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   238       "\\<x>", "\\<y>", "\\<z>",
   239 
   240       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   241       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   242       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   243       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   244       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   245       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   246       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   247       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   248       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   249 
   250       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   251       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   252       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   253       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   254       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   255       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   256       "\\<Psi>", "\\<Omega>",
   257 
   258       "\\<^isub>", "\\<^isup>")
   259 
   260     private val letters = raw_letters ++ raw_letters.map(decode(_))
   261 
   262     def is_letter(sym: String): Boolean = letters.contains(sym)
   263 
   264     def is_digit(sym: String): Boolean =
   265       if (sym.length == 1) {
   266         val c = sym(0)
   267         '0' <= c && c <= '9'
   268       }
   269       else false
   270 
   271     def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
   272 
   273 
   274     private val raw_blanks =
   275       Set(" ", "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   276     private val blanks = raw_blanks ++ raw_blanks.map(decode(_))
   277 
   278     def is_blank(sym: String): Boolean = blanks.contains(sym)
   279   }
   280 }