src/Pure/General/symbol.scala
author wenzelm
Thu, 07 Jul 2011 13:48:30 +0200
changeset 44569 5130dfe1b7be
parent 44550 8252d51d70e2
child 44570 58bb7ca5c7a2
permissions -rw-r--r--
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
tuned implicit build/init messages;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   /* spaces */
    17 
    18   val spc = ' '
    19   val space = " "
    20 
    21   private val static_spaces = space * 4000
    22 
    23   def spaces(k: Int): String =
    24   {
    25     require(k >= 0)
    26     if (k < static_spaces.length) static_spaces.substring(0, k)
    27     else space * k
    28   }
    29 
    30 
    31   /* ASCII characters */
    32 
    33   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    34   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    35   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    36 
    37   def is_ascii_letdig(c: Char): Boolean =
    38     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    39 
    40   def is_ascii_identifier(s: String): Boolean =
    41     s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig)
    42 
    43 
    44   /* Symbol regexps */
    45 
    46   private val plain = new Regex("""(?xs)
    47       [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """)
    48 
    49   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    50 
    51   private val symbol = new Regex("""(?xs)
    52       \\ < (?:
    53       \^? [A-Za-z][A-Za-z0-9_']* |
    54       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    55 
    56   private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    57     """ [\ud800-\udbff\ufffd] | \\<\^? """)
    58 
    59   val regex_total =
    60     new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
    61 
    62 
    63   /* basic matching */
    64 
    65   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    66 
    67   def is_physical_newline(s: String): Boolean =
    68     s == "\n" || s == "\r" || s == "\r\n"
    69 
    70   def is_malformed(s: String): Boolean =
    71     !(s.length == 1 && is_plain(s(0))) && malformed_symbol.pattern.matcher(s).matches
    72 
    73   class Matcher(text: CharSequence)
    74   {
    75     private val matcher = regex_total.pattern.matcher(text)
    76     def apply(start: Int, end: Int): Int =
    77     {
    78       require(0 <= start && start < end && end <= text.length)
    79       if (is_plain(text.charAt(start))) 1
    80       else {
    81         matcher.region(start, end).lookingAt
    82         matcher.group.length
    83       }
    84     }
    85   }
    86 
    87 
    88   /* iterator */
    89 
    90   private val char_symbols: Array[String] =
    91     (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray
    92 
    93   def iterator(text: CharSequence): Iterator[String] =
    94     new Iterator[String]
    95     {
    96       private val matcher = new Matcher(text)
    97       private var i = 0
    98       def hasNext = i < text.length
    99       def next =
   100       {
   101         val n = matcher(i, text.length)
   102         val s =
   103           if (n == 0) ""
   104           else if (n == 1) {
   105             val c = text.charAt(i)
   106             if (c < char_symbols.length) char_symbols(c)
   107             else text.subSequence(i, i + n).toString
   108           }
   109           else text.subSequence(i, i + n).toString
   110         i += n
   111         s
   112       }
   113     }
   114 
   115 
   116   /* decoding offsets */
   117 
   118   class Index(text: CharSequence)
   119   {
   120     case class Entry(chr: Int, sym: Int)
   121     val index: Array[Entry] =
   122     {
   123       val matcher = new Matcher(text)
   124       val buf = new mutable.ArrayBuffer[Entry]
   125       var chr = 0
   126       var sym = 0
   127       while (chr < text.length) {
   128         val n = matcher(chr, text.length)
   129         chr += n
   130         sym += 1
   131         if (n > 1) buf += Entry(chr, sym)
   132       }
   133       buf.toArray
   134     }
   135     def decode(sym1: Int): Int =
   136     {
   137       val sym = sym1 - 1
   138       val end = index.length
   139       def bisect(a: Int, b: Int): Int =
   140       {
   141         if (a < b) {
   142           val c = (a + b) / 2
   143           if (sym < index(c).sym) bisect(a, c)
   144           else if (c + 1 == end || sym < index(c + 1).sym) c
   145           else bisect(c + 1, b)
   146         }
   147         else -1
   148       }
   149       val i = bisect(0, end)
   150       if (i < 0) sym
   151       else index(i).chr + sym - index(i).sym
   152     }
   153     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   154   }
   155 
   156 
   157   /* recoding text */
   158 
   159   private class Recoder(list: List[(String, String)])
   160   {
   161     private val (min, max) =
   162     {
   163       var min = '\uffff'
   164       var max = '\u0000'
   165       for ((x, _) <- list) {
   166         val c = x(0)
   167         if (c < min) min = c
   168         if (c > max) max = c
   169       }
   170       (min, max)
   171     }
   172     private val table =
   173     {
   174       var tab = Map[String, String]()
   175       for ((x, y) <- list) {
   176         tab.get(x) match {
   177           case None => tab += (x -> y)
   178           case Some(z) =>
   179             error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
   180         }
   181       }
   182       tab
   183     }
   184     def recode(text: String): String =
   185     {
   186       val len = text.length
   187       val matcher = regex_total.pattern.matcher(text)
   188       val result = new StringBuilder(len)
   189       var i = 0
   190       while (i < len) {
   191         val c = text(i)
   192         if (min <= c && c <= max) {
   193           matcher.region(i, len).lookingAt
   194           val x = matcher.group
   195           result.append(table.get(x) getOrElse x)
   196           i = matcher.end
   197         }
   198         else { result.append(c); i += 1 }
   199       }
   200       result.toString
   201     }
   202   }
   203 
   204 
   205 
   206   /** symbol interpretation **/
   207 
   208   private lazy val symbols =
   209     new Interpretation(
   210       Isabelle_System.try_read(Path.split(Isabelle_System.getenv_strict("ISABELLE_SYMBOLS"))))
   211 
   212   private class Interpretation(symbols_spec: String)
   213   {
   214     /* read symbols */
   215 
   216     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   217     private val key = new Regex("""(?xs) (.+): """)
   218 
   219     private def read_decl(decl: String): (String, Map[String, String]) =
   220     {
   221       def err() = error("Bad symbol declaration: " + decl)
   222 
   223       def read_props(props: List[String]): Map[String, String] =
   224       {
   225         props match {
   226           case Nil => Map()
   227           case _ :: Nil => err()
   228           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   229           case _ => err()
   230         }
   231       }
   232       decl.split("\\s+").toList match {
   233         case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
   234         case _ => err()
   235       }
   236     }
   237 
   238     private val symbols: List[(String, Map[String, String])] =
   239       Map((
   240         for (decl <- split_lines(symbols_spec) if !empty.pattern.matcher(decl).matches)
   241           yield read_decl(decl)): _*) toList
   242 
   243 
   244     /* misc properties */
   245 
   246     val names: Map[String, String] =
   247     {
   248       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
   249       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   250     }
   251 
   252     val abbrevs: Map[String, String] =
   253       Map((
   254         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   255           yield (sym -> props("abbrev"))): _*)
   256 
   257 
   258     /* recoding */
   259 
   260     private val (decoder, encoder) =
   261     {
   262       val mapping =
   263         for {
   264           (sym, props) <- symbols
   265           val code =
   266             try { Integer.decode(props("code")).intValue }
   267             catch {
   268               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   269               case _: NumberFormatException => error("Bad code for symbol " + sym)
   270             }
   271           val ch = new String(Character.toChars(code))
   272         } yield {
   273           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   274           else (sym, ch)
   275         }
   276       (new Recoder(mapping),
   277        new Recoder(mapping map { case (x, y) => (y, x) }))
   278     }
   279 
   280     def decode(text: String): String = decoder.recode(text)
   281     def encode(text: String): String = encoder.recode(text)
   282 
   283     private def recode_set(elems: String*): Set[String] =
   284     {
   285       val content = elems.toList
   286       Set((content ::: content.map(decode)): _*)
   287     }
   288 
   289     private def recode_map[A](elems: (String, A)*): Map[String, A] =
   290     {
   291       val content = elems.toList
   292       Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*)
   293     }
   294 
   295 
   296     /* user fonts */
   297 
   298     val fonts: Map[String, String] =
   299       recode_map((
   300         for ((sym, props) <- symbols if props.isDefinedAt("font"))
   301           yield (sym -> props("font"))): _*)
   302 
   303     val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
   304     val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
   305 
   306 
   307     /* classification */
   308 
   309     val letters = recode_set(
   310       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   311       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   312       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   313       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   314 
   315       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   316       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   317       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   318       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   319       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   320       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   321       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   322       "\\<x>", "\\<y>", "\\<z>",
   323 
   324       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   325       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   326       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   327       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   328       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   329       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   330       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   331       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   332       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   333 
   334       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   335       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   336       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   337       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   338       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   339       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   340       "\\<Psi>", "\\<Omega>",
   341 
   342       "\\<^isub>", "\\<^isup>")
   343 
   344     val blanks =
   345       recode_set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   346 
   347     val sym_chars =
   348       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   349 
   350 
   351     /* control symbols */
   352 
   353     val ctrl_decoded: Set[String] =
   354       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
   355 
   356     val subscript_decoded = Set(decode("\\<^sub>"), decode("\\<^isub>"))
   357     val superscript_decoded = Set(decode("\\<^sup>"), decode("\\<^isup>"))
   358 
   359     val bold_decoded = decode("\\<^bold>")
   360     val bsub_decoded = decode("\\<^bsub>")
   361     val esub_decoded = decode("\\<^esub>")
   362     val bsup_decoded = decode("\\<^bsup>")
   363     val esup_decoded = decode("\\<^esup>")
   364   }
   365 
   366 
   367   /* tables */
   368 
   369   def names: Map[String, String] = symbols.names
   370   def abbrevs: Map[String, String] = symbols.abbrevs
   371 
   372   def decode(text: String): String = symbols.decode(text)
   373   def encode(text: String): String = symbols.encode(text)
   374 
   375   def fonts: Map[String, String] = symbols.fonts
   376   def font_names: List[String] = symbols.font_names
   377   def font_index: Map[String, Int] = symbols.font_index
   378   def lookup_font(sym: String): Option[Int] = symbols.fonts.get(sym).map(font_index(_))
   379 
   380 
   381   /* classification */
   382 
   383   def is_letter(sym: String): Boolean = symbols.letters.contains(sym)
   384   def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   385   def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
   386   def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   387   def is_blank(sym: String): Boolean = symbols.blanks.contains(sym)
   388   def is_symbolic_char(sym: String): Boolean = symbols.sym_chars.contains(sym)
   389   def is_symbolic(sym: String): Boolean =
   390     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   391 
   392 
   393   /* control symbols */
   394 
   395   def is_ctrl(sym: String): Boolean =
   396     sym.startsWith("\\<^") || symbols.ctrl_decoded.contains(sym)
   397 
   398   def is_controllable(sym: String): Boolean =
   399     !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym)
   400 
   401   def is_subscript_decoded(sym: String): Boolean = symbols.subscript_decoded.contains(sym)
   402   def is_superscript_decoded(sym: String): Boolean = symbols.superscript_decoded.contains(sym)
   403   def is_bold_decoded(sym: String): Boolean = sym == symbols.bold_decoded
   404   def is_bsub_decoded(sym: String): Boolean = sym == symbols.bsub_decoded
   405   def is_esub_decoded(sym: String): Boolean = sym == symbols.esub_decoded
   406   def is_bsup_decoded(sym: String): Boolean = sym == symbols.bsup_decoded
   407   def is_esup_decoded(sym: String): Boolean = sym == symbols.esup_decoded
   408 }