src/Pure/General/symbol.scala
author wenzelm
Sat, 13 Nov 2010 16:46:00 +0100
changeset 40768 37b79d789d91
parent 40690 41c32616298c
child 40769 1050315f6ee2
permissions -rw-r--r--
tuned;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   /* spaces */
    17 
    18   val spc = ' '
    19   val space = " "
    20 
    21   private val static_spaces = space * 4000
    22 
    23   def spaces(k: Int): String =
    24   {
    25     require(k >= 0)
    26     if (k < static_spaces.length) static_spaces.substring(0, k)
    27     else space * k
    28   }
    29 
    30 
    31   /* Symbol regexps */
    32 
    33   private val plain = new Regex("""(?xs)
    34       [^\r\\\ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)
    35 
    36   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    37 
    38   private val symbol = new Regex("""(?xs)
    39       \\ < (?:
    40       \^? [A-Za-z][A-Za-z0-9_']* |
    41       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    42 
    43   // FIXME cover bad surrogates!?
    44   // FIXME check wrt. Isabelle/ML version
    45   private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    46     """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
    47 
    48   // total pattern
    49   val regex = new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + bad_symbol + "| .")
    50 
    51 
    52   /* basic matching */
    53 
    54   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    55 
    56   def is_physical_newline(s: CharSequence): Boolean =
    57     "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
    58 
    59   def is_wellformed(s: CharSequence): Boolean =
    60     s.length == 1 && is_plain(s.charAt(0)) || !bad_symbol.pattern.matcher(s).matches
    61 
    62   class Matcher(text: CharSequence)
    63   {
    64     private val matcher = regex.pattern.matcher(text)
    65     def apply(start: Int, end: Int): Int =
    66     {
    67       require(0 <= start && start < end && end <= text.length)
    68       if (is_plain(text.charAt(start))) 1
    69       else {
    70         matcher.region(start, end).lookingAt
    71         matcher.group.length
    72       }
    73     }
    74   }
    75 
    76 
    77   /* iterator */
    78 
    79   def iterator(text: CharSequence) = new Iterator[CharSequence]
    80   {
    81     private val matcher = new Matcher(text)
    82     private var i = 0
    83     def hasNext = i < text.length
    84     def next =
    85     {
    86       val n = matcher(i, text.length)
    87       val s = text.subSequence(i, i + n)
    88       i += n
    89       s
    90     }
    91   }
    92 
    93 
    94   /* decoding offsets */
    95 
    96   class Index(text: CharSequence)
    97   {
    98     case class Entry(chr: Int, sym: Int)
    99     val index: Array[Entry] =
   100     {
   101       val matcher = new Matcher(text)
   102       val buf = new mutable.ArrayBuffer[Entry]
   103       var chr = 0
   104       var sym = 0
   105       while (chr < text.length) {
   106         val n = matcher(chr, text.length)
   107         chr += n
   108         sym += 1
   109         if (n > 1) buf += Entry(chr, sym)
   110       }
   111       buf.toArray
   112     }
   113     def decode(sym1: Int): Int =
   114     {
   115       val sym = sym1 - 1
   116       val end = index.length
   117       def bisect(a: Int, b: Int): Int =
   118       {
   119         if (a < b) {
   120           val c = (a + b) / 2
   121           if (sym < index(c).sym) bisect(a, c)
   122           else if (c + 1 == end || sym < index(c + 1).sym) c
   123           else bisect(c + 1, b)
   124         }
   125         else -1
   126       }
   127       val i = bisect(0, end)
   128       if (i < 0) sym
   129       else index(i).chr + sym - index(i).sym
   130     }
   131     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   132   }
   133 
   134 
   135   /* recoding text */
   136 
   137   private class Recoder(list: List[(String, String)])
   138   {
   139     private val (min, max) =
   140     {
   141       var min = '\uffff'
   142       var max = '\u0000'
   143       for ((x, _) <- list) {
   144         val c = x(0)
   145         if (c < min) min = c
   146         if (c > max) max = c
   147       }
   148       (min, max)
   149     }
   150     private val table =
   151     {
   152       var tab = Map[String, String]()
   153       for ((x, y) <- list) {
   154         tab.get(x) match {
   155           case None => tab += (x -> y)
   156           case Some(z) =>
   157             error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
   158         }
   159       }
   160       tab
   161     }
   162     def recode(text: String): String =
   163     {
   164       val len = text.length
   165       val matcher = regex.pattern.matcher(text)
   166       val result = new StringBuilder(len)
   167       var i = 0
   168       while (i < len) {
   169         val c = text(i)
   170         if (min <= c && c <= max) {
   171           matcher.region(i, len).lookingAt
   172           val x = matcher.group
   173           result.append(table.get(x) getOrElse x)
   174           i = matcher.end
   175         }
   176         else { result.append(c); i += 1 }
   177       }
   178       result.toString
   179     }
   180   }
   181 
   182 
   183 
   184   /** Symbol interpretation **/
   185 
   186   class Interpretation(symbol_decls: List[String])
   187   {
   188     /* read symbols */
   189 
   190     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   191     private val key = new Regex("""(?xs) (.+): """)
   192 
   193     private def read_decl(decl: String): (String, Map[String, String]) =
   194     {
   195       def err() = error("Bad symbol declaration: " + decl)
   196 
   197       def read_props(props: List[String]): Map[String, String] =
   198       {
   199         props match {
   200           case Nil => Map()
   201           case _ :: Nil => err()
   202           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   203           case _ => err()
   204         }
   205       }
   206       decl.split("\\s+").toList match {
   207         case sym :: props if sym.length > 1 && is_wellformed(sym) => (sym, read_props(props))
   208         case _ => err()
   209       }
   210     }
   211 
   212     private val symbols: List[(String, Map[String, String])] =
   213       Map((
   214         for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches)
   215           yield read_decl(decl)): _*) toList
   216 
   217 
   218     /* misc properties */
   219 
   220     val names: Map[String, String] =
   221     {
   222       val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
   223       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   224     }
   225 
   226     val abbrevs: Map[String, String] =
   227       Map((
   228         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   229           yield (sym -> props("abbrev"))): _*)
   230 
   231 
   232     /* main recoder methods */
   233 
   234     private val (decoder, encoder) =
   235     {
   236       val mapping =
   237         for {
   238           (sym, props) <- symbols
   239           val code =
   240             try { Integer.decode(props("code")).intValue }
   241             catch {
   242               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   243               case _: NumberFormatException => error("Bad code for symbol " + sym)
   244             }
   245           val ch = new String(Character.toChars(code))
   246         } yield {
   247           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   248           else (sym, ch)
   249         }
   250       (new Recoder(mapping),
   251        new Recoder(mapping map { case (x, y) => (y, x) }))
   252     }
   253 
   254     def decode(text: String): String = decoder.recode(text)
   255     def encode(text: String): String = encoder.recode(text)
   256 
   257 
   258     /* classification */
   259 
   260     private object Decode_Set
   261     {
   262       def apply(elems: String*): Set[String] =
   263       {
   264         val content = elems.toList
   265         Set((content ::: content.map(decode)): _*)
   266       }
   267     }
   268 
   269     private val letters = Decode_Set(
   270       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   271       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   272       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   273       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   274 
   275       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   276       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   277       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   278       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   279       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   280       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   281       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   282       "\\<x>", "\\<y>", "\\<z>",
   283 
   284       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   285       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   286       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   287       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   288       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   289       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   290       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   291       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   292       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   293 
   294       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   295       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   296       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   297       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   298       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   299       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   300       "\\<Psi>", "\\<Omega>",
   301 
   302       "\\<^isub>", "\\<^isup>")
   303 
   304     private val blanks =
   305       Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   306 
   307     private val sym_chars =
   308       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   309 
   310     def is_letter(sym: String): Boolean = letters.contains(sym)
   311     def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   312     def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
   313     def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   314     def is_blank(sym: String): Boolean = blanks.contains(sym)
   315     def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
   316     def is_symbolic(sym: String): Boolean = sym.startsWith("\\<") && !sym.startsWith("\\<^")
   317   }
   318 }