src/Pure/General/symbol.scala
author wenzelm
Tue, 21 Jun 2011 13:29:44 +0200
changeset 44360 132f99cc0a43
parent 44359 39035276927c
child 44361 5e6f76cacb93
permissions -rw-r--r--
tuned iteration over short symbols;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   /* spaces */
    17 
    18   val spc = ' '
    19   val space = " "
    20 
    21   private val static_spaces = space * 4000
    22 
    23   def spaces(k: Int): String =
    24   {
    25     require(k >= 0)
    26     if (k < static_spaces.length) static_spaces.substring(0, k)
    27     else space * k
    28   }
    29 
    30 
    31   /* ASCII characters */
    32 
    33   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    34   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    35   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    36 
    37   def is_ascii_letdig(c: Char): Boolean =
    38     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    39 
    40   def is_ascii_identifier(s: String): Boolean =
    41     s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig)
    42 
    43 
    44   /* Symbol regexps */
    45 
    46   private val plain = new Regex("""(?xs)
    47       [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """)
    48 
    49   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    50 
    51   private val symbol = new Regex("""(?xs)
    52       \\ < (?:
    53       \^? [A-Za-z][A-Za-z0-9_']* |
    54       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    55 
    56   private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    57     """ [\ud800-\udbff\ufffd] | \\<\^? """)
    58 
    59   val regex_total =
    60     new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
    61 
    62 
    63   /* basic matching */
    64 
    65   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    66 
    67   def is_physical_newline(s: CharSequence): Boolean =
    68     "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
    69 
    70   def is_malformed(s: CharSequence): Boolean =
    71     !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches
    72 
    73   class Matcher(text: CharSequence)
    74   {
    75     private val matcher = regex_total.pattern.matcher(text)
    76     def apply(start: Int, end: Int): Int =
    77     {
    78       require(0 <= start && start < end && end <= text.length)
    79       if (is_plain(text.charAt(start))) 1
    80       else {
    81         matcher.region(start, end).lookingAt
    82         matcher.group.length
    83       }
    84     }
    85   }
    86 
    87 
    88   /* efficient iterators */
    89 
    90   def iterator(text: CharSequence): Iterator[CharSequence] =
    91     new Iterator[CharSequence]
    92     {
    93       private val matcher = new Matcher(text)
    94       private var i = 0
    95       def hasNext = i < text.length
    96       def next =
    97       {
    98         val n = matcher(i, text.length)
    99         val s = text.subSequence(i, i + n)
   100         i += n
   101         s
   102       }
   103     }
   104 
   105   private val char_symbols: Array[String] =
   106     (0 to 127).iterator.map(i => new String(Array(i.toChar))).toArray
   107 
   108   private def make_string(sym: CharSequence): String =
   109     sym.length match {
   110       case 0 => ""
   111       case 1 =>
   112         val c = sym.charAt(0)
   113         if (c < char_symbols.length) char_symbols(c)
   114         else sym.toString
   115       case _ => sym.toString
   116     }
   117 
   118   def iterator_string(text: CharSequence): Iterator[String] =
   119     iterator(text).map(make_string)
   120 
   121 
   122   /* decoding offsets */
   123 
   124   class Index(text: CharSequence)
   125   {
   126     case class Entry(chr: Int, sym: Int)
   127     val index: Array[Entry] =
   128     {
   129       val matcher = new Matcher(text)
   130       val buf = new mutable.ArrayBuffer[Entry]
   131       var chr = 0
   132       var sym = 0
   133       while (chr < text.length) {
   134         val n = matcher(chr, text.length)
   135         chr += n
   136         sym += 1
   137         if (n > 1) buf += Entry(chr, sym)
   138       }
   139       buf.toArray
   140     }
   141     def decode(sym1: Int): Int =
   142     {
   143       val sym = sym1 - 1
   144       val end = index.length
   145       def bisect(a: Int, b: Int): Int =
   146       {
   147         if (a < b) {
   148           val c = (a + b) / 2
   149           if (sym < index(c).sym) bisect(a, c)
   150           else if (c + 1 == end || sym < index(c + 1).sym) c
   151           else bisect(c + 1, b)
   152         }
   153         else -1
   154       }
   155       val i = bisect(0, end)
   156       if (i < 0) sym
   157       else index(i).chr + sym - index(i).sym
   158     }
   159     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   160   }
   161 
   162 
   163   /* recoding text */
   164 
   165   private class Recoder(list: List[(String, String)])
   166   {
   167     private val (min, max) =
   168     {
   169       var min = '\uffff'
   170       var max = '\u0000'
   171       for ((x, _) <- list) {
   172         val c = x(0)
   173         if (c < min) min = c
   174         if (c > max) max = c
   175       }
   176       (min, max)
   177     }
   178     private val table =
   179     {
   180       var tab = Map[String, String]()
   181       for ((x, y) <- list) {
   182         tab.get(x) match {
   183           case None => tab += (x -> y)
   184           case Some(z) =>
   185             error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
   186         }
   187       }
   188       tab
   189     }
   190     def recode(text: String): String =
   191     {
   192       val len = text.length
   193       val matcher = regex_total.pattern.matcher(text)
   194       val result = new StringBuilder(len)
   195       var i = 0
   196       while (i < len) {
   197         val c = text(i)
   198         if (min <= c && c <= max) {
   199           matcher.region(i, len).lookingAt
   200           val x = matcher.group
   201           result.append(table.get(x) getOrElse x)
   202           i = matcher.end
   203         }
   204         else { result.append(c); i += 1 }
   205       }
   206       result.toString
   207     }
   208   }
   209 
   210 
   211 
   212   /** Symbol interpretation **/
   213 
   214   class Interpretation(symbol_decls: List[String])
   215   {
   216     /* read symbols */
   217 
   218     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   219     private val key = new Regex("""(?xs) (.+): """)
   220 
   221     private def read_decl(decl: String): (String, Map[String, String]) =
   222     {
   223       def err() = error("Bad symbol declaration: " + decl)
   224 
   225       def read_props(props: List[String]): Map[String, String] =
   226       {
   227         props match {
   228           case Nil => Map()
   229           case _ :: Nil => err()
   230           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   231           case _ => err()
   232         }
   233       }
   234       decl.split("\\s+").toList match {
   235         case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
   236         case _ => err()
   237       }
   238     }
   239 
   240     private val symbols: List[(String, Map[String, String])] =
   241       Map((
   242         for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches)
   243           yield read_decl(decl)): _*) toList
   244 
   245 
   246     /* misc properties */
   247 
   248     val names: Map[String, String] =
   249     {
   250       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
   251       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   252     }
   253 
   254     val abbrevs: Map[String, String] =
   255       Map((
   256         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   257           yield (sym -> props("abbrev"))): _*)
   258 
   259 
   260     /* user fonts */
   261 
   262     val fonts: Map[String, String] =
   263       Map((
   264         for ((sym, props) <- symbols if props.isDefinedAt("font"))
   265           yield (sym -> props("font"))): _*)
   266 
   267     val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
   268     val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
   269 
   270     def lookup_font(sym: String): Option[Int] = fonts.get(sym).map(font_index(_))
   271 
   272 
   273     /* main recoder methods */
   274 
   275     private val (decoder, encoder) =
   276     {
   277       val mapping =
   278         for {
   279           (sym, props) <- symbols
   280           val code =
   281             try { Integer.decode(props("code")).intValue }
   282             catch {
   283               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   284               case _: NumberFormatException => error("Bad code for symbol " + sym)
   285             }
   286           val ch = new String(Character.toChars(code))
   287         } yield {
   288           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   289           else (sym, ch)
   290         }
   291       (new Recoder(mapping),
   292        new Recoder(mapping map { case (x, y) => (y, x) }))
   293     }
   294 
   295     def decode(text: String): String = decoder.recode(text)
   296     def encode(text: String): String = encoder.recode(text)
   297 
   298 
   299     /* classification */
   300 
   301     private object Decode_Set
   302     {
   303       def apply(elems: String*): Set[String] =
   304       {
   305         val content = elems.toList
   306         Set((content ::: content.map(decode)): _*)
   307       }
   308     }
   309 
   310     private val letters = Decode_Set(
   311       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   312       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   313       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   314       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   315 
   316       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   317       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   318       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   319       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   320       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   321       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   322       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   323       "\\<x>", "\\<y>", "\\<z>",
   324 
   325       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   326       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   327       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   328       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   329       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   330       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   331       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   332       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   333       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   334 
   335       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   336       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   337       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   338       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   339       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   340       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   341       "\\<Psi>", "\\<Omega>",
   342 
   343       "\\<^isub>", "\\<^isup>")
   344 
   345     private val blanks =
   346       Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   347 
   348     private val sym_chars =
   349       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   350 
   351     def is_letter(sym: String): Boolean = letters.contains(sym)
   352     def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   353     def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
   354     def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   355     def is_blank(sym: String): Boolean = blanks.contains(sym)
   356     def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
   357     def is_symbolic(sym: String): Boolean =
   358       sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   359 
   360 
   361     /* control symbols */
   362 
   363     private val ctrl_decoded: Set[String] =
   364       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
   365 
   366     def is_ctrl(sym: String): Boolean =
   367       sym.startsWith("\\<^") || ctrl_decoded.contains(sym)
   368 
   369     def is_controllable(sym: String): Boolean =
   370       !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym)
   371 
   372     private val subscript_decoded = Set(decode("\\<^sub>"), decode("\\<^isub>"))
   373     private val superscript_decoded = Set(decode("\\<^sup>"), decode("\\<^isup>"))
   374     val bold_decoded = decode("\\<^bold>")
   375 
   376     def is_subscript_decoded(sym: String): Boolean = subscript_decoded.contains(sym)
   377     def is_superscript_decoded(sym: String): Boolean = superscript_decoded.contains(sym)
   378     def is_bold_decoded(sym: String): Boolean = sym == bold_decoded
   379   }
   380 }