src/Pure/General/symbol.scala
author wenzelm
Thu, 07 Jul 2011 14:10:50 +0200
changeset 44570 58bb7ca5c7a2
parent 44569 5130dfe1b7be
child 44596 3749d1e6dde9
permissions -rw-r--r--
explicit indication of type Symbol.Symbol;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   type Symbol = String
    17 
    18 
    19   /* spaces */
    20 
    21   val spc = ' '
    22   val space = " "
    23 
    24   private val static_spaces = space * 4000
    25 
    26   def spaces(k: Int): String =
    27   {
    28     require(k >= 0)
    29     if (k < static_spaces.length) static_spaces.substring(0, k)
    30     else space * k
    31   }
    32 
    33 
    34   /* ASCII characters */
    35 
    36   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    37   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    38   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    39 
    40   def is_ascii_letdig(c: Char): Boolean =
    41     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    42 
    43   def is_ascii_identifier(s: String): Boolean =
    44     s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig)
    45 
    46 
    47   /* Symbol regexps */
    48 
    49   private val plain = new Regex("""(?xs)
    50       [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """)
    51 
    52   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    53 
    54   private val symbol = new Regex("""(?xs)
    55       \\ < (?:
    56       \^? [A-Za-z][A-Za-z0-9_']* |
    57       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    58 
    59   private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    60     """ [\ud800-\udbff\ufffd] | \\<\^? """)
    61 
    62   val regex_total =
    63     new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
    64 
    65 
    66   /* basic matching */
    67 
    68   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    69 
    70   def is_physical_newline(s: Symbol): Boolean =
    71     s == "\n" || s == "\r" || s == "\r\n"
    72 
    73   def is_malformed(s: Symbol): Boolean =
    74     !(s.length == 1 && is_plain(s(0))) && malformed_symbol.pattern.matcher(s).matches
    75 
    76   class Matcher(text: CharSequence)
    77   {
    78     private val matcher = regex_total.pattern.matcher(text)
    79     def apply(start: Int, end: Int): Int =
    80     {
    81       require(0 <= start && start < end && end <= text.length)
    82       if (is_plain(text.charAt(start))) 1
    83       else {
    84         matcher.region(start, end).lookingAt
    85         matcher.group.length
    86       }
    87     }
    88   }
    89 
    90 
    91   /* iterator */
    92 
    93   private val char_symbols: Array[Symbol] =
    94     (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray
    95 
    96   def iterator(text: CharSequence): Iterator[Symbol] =
    97     new Iterator[Symbol]
    98     {
    99       private val matcher = new Matcher(text)
   100       private var i = 0
   101       def hasNext = i < text.length
   102       def next =
   103       {
   104         val n = matcher(i, text.length)
   105         val s =
   106           if (n == 0) ""
   107           else if (n == 1) {
   108             val c = text.charAt(i)
   109             if (c < char_symbols.length) char_symbols(c)
   110             else text.subSequence(i, i + n).toString
   111           }
   112           else text.subSequence(i, i + n).toString
   113         i += n
   114         s
   115       }
   116     }
   117 
   118 
   119   /* decoding offsets */
   120 
   121   class Index(text: CharSequence)
   122   {
   123     case class Entry(chr: Int, sym: Int)
   124     val index: Array[Entry] =
   125     {
   126       val matcher = new Matcher(text)
   127       val buf = new mutable.ArrayBuffer[Entry]
   128       var chr = 0
   129       var sym = 0
   130       while (chr < text.length) {
   131         val n = matcher(chr, text.length)
   132         chr += n
   133         sym += 1
   134         if (n > 1) buf += Entry(chr, sym)
   135       }
   136       buf.toArray
   137     }
   138     def decode(sym1: Int): Int =
   139     {
   140       val sym = sym1 - 1
   141       val end = index.length
   142       def bisect(a: Int, b: Int): Int =
   143       {
   144         if (a < b) {
   145           val c = (a + b) / 2
   146           if (sym < index(c).sym) bisect(a, c)
   147           else if (c + 1 == end || sym < index(c + 1).sym) c
   148           else bisect(c + 1, b)
   149         }
   150         else -1
   151       }
   152       val i = bisect(0, end)
   153       if (i < 0) sym
   154       else index(i).chr + sym - index(i).sym
   155     }
   156     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   157   }
   158 
   159 
   160   /* recoding text */
   161 
   162   private class Recoder(list: List[(String, String)])
   163   {
   164     private val (min, max) =
   165     {
   166       var min = '\uffff'
   167       var max = '\u0000'
   168       for ((x, _) <- list) {
   169         val c = x(0)
   170         if (c < min) min = c
   171         if (c > max) max = c
   172       }
   173       (min, max)
   174     }
   175     private val table =
   176     {
   177       var tab = Map[String, String]()
   178       for ((x, y) <- list) {
   179         tab.get(x) match {
   180           case None => tab += (x -> y)
   181           case Some(z) =>
   182             error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
   183         }
   184       }
   185       tab
   186     }
   187     def recode(text: String): String =
   188     {
   189       val len = text.length
   190       val matcher = regex_total.pattern.matcher(text)
   191       val result = new StringBuilder(len)
   192       var i = 0
   193       while (i < len) {
   194         val c = text(i)
   195         if (min <= c && c <= max) {
   196           matcher.region(i, len).lookingAt
   197           val x = matcher.group
   198           result.append(table.get(x) getOrElse x)
   199           i = matcher.end
   200         }
   201         else { result.append(c); i += 1 }
   202       }
   203       result.toString
   204     }
   205   }
   206 
   207 
   208 
   209   /** symbol interpretation **/
   210 
   211   private lazy val symbols =
   212     new Interpretation(
   213       Isabelle_System.try_read(Path.split(Isabelle_System.getenv_strict("ISABELLE_SYMBOLS"))))
   214 
   215   private class Interpretation(symbols_spec: String)
   216   {
   217     /* read symbols */
   218 
   219     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   220     private val key = new Regex("""(?xs) (.+): """)
   221 
   222     private def read_decl(decl: String): (Symbol, Map[String, String]) =
   223     {
   224       def err() = error("Bad symbol declaration: " + decl)
   225 
   226       def read_props(props: List[String]): Map[String, String] =
   227       {
   228         props match {
   229           case Nil => Map()
   230           case _ :: Nil => err()
   231           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   232           case _ => err()
   233         }
   234       }
   235       decl.split("\\s+").toList match {
   236         case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
   237         case _ => err()
   238       }
   239     }
   240 
   241     private val symbols: List[(Symbol, Map[String, String])] =
   242       Map((
   243         for (decl <- split_lines(symbols_spec) if !empty.pattern.matcher(decl).matches)
   244           yield read_decl(decl)): _*) toList
   245 
   246 
   247     /* misc properties */
   248 
   249     val names: Map[Symbol, String] =
   250     {
   251       val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""")
   252       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   253     }
   254 
   255     val abbrevs: Map[Symbol, String] =
   256       Map((
   257         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   258           yield (sym -> props("abbrev"))): _*)
   259 
   260 
   261     /* recoding */
   262 
   263     private val (decoder, encoder) =
   264     {
   265       val mapping =
   266         for {
   267           (sym, props) <- symbols
   268           val code =
   269             try { Integer.decode(props("code")).intValue }
   270             catch {
   271               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   272               case _: NumberFormatException => error("Bad code for symbol " + sym)
   273             }
   274           val ch = new String(Character.toChars(code))
   275         } yield {
   276           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   277           else (sym, ch)
   278         }
   279       (new Recoder(mapping),
   280        new Recoder(mapping map { case (x, y) => (y, x) }))
   281     }
   282 
   283     def decode(text: String): String = decoder.recode(text)
   284     def encode(text: String): String = encoder.recode(text)
   285 
   286     private def recode_set(elems: String*): Set[String] =
   287     {
   288       val content = elems.toList
   289       Set((content ::: content.map(decode)): _*)
   290     }
   291 
   292     private def recode_map[A](elems: (String, A)*): Map[String, A] =
   293     {
   294       val content = elems.toList
   295       Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*)
   296     }
   297 
   298 
   299     /* user fonts */
   300 
   301     val fonts: Map[Symbol, String] =
   302       recode_map((
   303         for ((sym, props) <- symbols if props.isDefinedAt("font"))
   304           yield (sym -> props("font"))): _*)
   305 
   306     val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList
   307     val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*)
   308 
   309 
   310     /* classification */
   311 
   312     val letters = recode_set(
   313       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   314       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   315       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   316       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   317 
   318       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   319       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   320       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   321       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   322       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   323       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   324       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   325       "\\<x>", "\\<y>", "\\<z>",
   326 
   327       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   328       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   329       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   330       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   331       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   332       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   333       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   334       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   335       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   336 
   337       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   338       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   339       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   340       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   341       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   342       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   343       "\\<Psi>", "\\<Omega>",
   344 
   345       "\\<^isub>", "\\<^isup>")
   346 
   347     val blanks =
   348       recode_set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   349 
   350     val sym_chars =
   351       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   352 
   353 
   354     /* control symbols */
   355 
   356     val ctrl_decoded: Set[Symbol] =
   357       Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*)
   358 
   359     val subscript_decoded = Set(decode("\\<^sub>"), decode("\\<^isub>"))
   360     val superscript_decoded = Set(decode("\\<^sup>"), decode("\\<^isup>"))
   361 
   362     val bold_decoded = decode("\\<^bold>")
   363     val bsub_decoded = decode("\\<^bsub>")
   364     val esub_decoded = decode("\\<^esub>")
   365     val bsup_decoded = decode("\\<^bsup>")
   366     val esup_decoded = decode("\\<^esup>")
   367   }
   368 
   369 
   370   /* tables */
   371 
   372   def names: Map[Symbol, String] = symbols.names
   373   def abbrevs: Map[Symbol, String] = symbols.abbrevs
   374 
   375   def decode(text: String): String = symbols.decode(text)
   376   def encode(text: String): String = symbols.encode(text)
   377 
   378   def fonts: Map[Symbol, String] = symbols.fonts
   379   def font_names: List[String] = symbols.font_names
   380   def font_index: Map[String, Int] = symbols.font_index
   381   def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_))
   382 
   383 
   384   /* classification */
   385 
   386   def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym)
   387   def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   388   def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'"
   389   def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   390   def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym)
   391   def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym)
   392   def is_symbolic(sym: Symbol): Boolean =
   393     sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   394 
   395 
   396   /* control symbols */
   397 
   398   def is_ctrl(sym: Symbol): Boolean =
   399     sym.startsWith("\\<^") || symbols.ctrl_decoded.contains(sym)
   400 
   401   def is_controllable(sym: Symbol): Boolean =
   402     !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym)
   403 
   404   def is_subscript_decoded(sym: Symbol): Boolean = symbols.subscript_decoded.contains(sym)
   405   def is_superscript_decoded(sym: Symbol): Boolean = symbols.superscript_decoded.contains(sym)
   406   def is_bold_decoded(sym: Symbol): Boolean = sym == symbols.bold_decoded
   407   def is_bsub_decoded(sym: Symbol): Boolean = sym == symbols.bsub_decoded
   408   def is_esub_decoded(sym: Symbol): Boolean = sym == symbols.esub_decoded
   409   def is_bsup_decoded(sym: Symbol): Boolean = sym == symbols.bsup_decoded
   410   def is_esup_decoded(sym: Symbol): Boolean = sym == symbols.esup_decoded
   411 }