src/Pure/General/symbol.scala
author wenzelm
Sat, 18 Jun 2011 18:57:38 +0200
changeset 44318 0ef3ec385b2b
parent 44314 5d9693c2337e
child 44330 4b4b93672f15
permissions -rw-r--r--
do not control malformed symbols;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   /* spaces */
    17 
    18   val spc = ' '
    19   val space = " "
    20 
    21   private val static_spaces = space * 4000
    22 
    23   def spaces(k: Int): String =
    24   {
    25     require(k >= 0)
    26     if (k < static_spaces.length) static_spaces.substring(0, k)
    27     else space * k
    28   }
    29 
    30 
    31   /* ASCII characters */
    32 
    33   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    34   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
    35   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
    36 
    37   def is_ascii_letdig(c: Char): Boolean =
    38     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
    39 
    40   def is_ascii_identifier(s: String): Boolean =
    41     s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig)
    42 
    43 
    44   /* Symbol regexps */
    45 
    46   private val plain = new Regex("""(?xs)
    47       [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """)
    48 
    49   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    50 
    51   private val symbol = new Regex("""(?xs)
    52       \\ < (?:
    53       \^? [A-Za-z][A-Za-z0-9_']* |
    54       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    55 
    56   private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    57     """ [\ud800-\udbff\ufffd] | \\<\^? """)
    58 
    59   val regex_total =
    60     new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
    61 
    62 
    63   /* basic matching */
    64 
    65   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    66 
    67   def is_physical_newline(s: CharSequence): Boolean =
    68     "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
    69 
    70   def is_malformed(s: CharSequence): Boolean =
    71     !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches
    72 
    73   class Matcher(text: CharSequence)
    74   {
    75     private val matcher = regex_total.pattern.matcher(text)
    76     def apply(start: Int, end: Int): Int =
    77     {
    78       require(0 <= start && start < end && end <= text.length)
    79       if (is_plain(text.charAt(start))) 1
    80       else {
    81         matcher.region(start, end).lookingAt
    82         matcher.group.length
    83       }
    84     }
    85   }
    86 
    87 
    88   /* iterator */
    89 
    90   def iterator(text: CharSequence) = new Iterator[CharSequence]
    91   {
    92     private val matcher = new Matcher(text)
    93     private var i = 0
    94     def hasNext = i < text.length
    95     def next =
    96     {
    97       val n = matcher(i, text.length)
    98       val s = text.subSequence(i, i + n)
    99       i += n
   100       s
   101     }
   102   }
   103 
   104 
   105   /* decoding offsets */
   106 
   107   class Index(text: CharSequence)
   108   {
   109     case class Entry(chr: Int, sym: Int)
   110     val index: Array[Entry] =
   111     {
   112       val matcher = new Matcher(text)
   113       val buf = new mutable.ArrayBuffer[Entry]
   114       var chr = 0
   115       var sym = 0
   116       while (chr < text.length) {
   117         val n = matcher(chr, text.length)
   118         chr += n
   119         sym += 1
   120         if (n > 1) buf += Entry(chr, sym)
   121       }
   122       buf.toArray
   123     }
   124     def decode(sym1: Int): Int =
   125     {
   126       val sym = sym1 - 1
   127       val end = index.length
   128       def bisect(a: Int, b: Int): Int =
   129       {
   130         if (a < b) {
   131           val c = (a + b) / 2
   132           if (sym < index(c).sym) bisect(a, c)
   133           else if (c + 1 == end || sym < index(c + 1).sym) c
   134           else bisect(c + 1, b)
   135         }
   136         else -1
   137       }
   138       val i = bisect(0, end)
   139       if (i < 0) sym
   140       else index(i).chr + sym - index(i).sym
   141     }
   142     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   143   }
   144 
   145 
   146   /* recoding text */
   147 
   148   private class Recoder(list: List[(String, String)])
   149   {
   150     private val (min, max) =
   151     {
   152       var min = '\uffff'
   153       var max = '\u0000'
   154       for ((x, _) <- list) {
   155         val c = x(0)
   156         if (c < min) min = c
   157         if (c > max) max = c
   158       }
   159       (min, max)
   160     }
   161     private val table =
   162     {
   163       var tab = Map[String, String]()
   164       for ((x, y) <- list) {
   165         tab.get(x) match {
   166           case None => tab += (x -> y)
   167           case Some(z) =>
   168             error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
   169         }
   170       }
   171       tab
   172     }
   173     def recode(text: String): String =
   174     {
   175       val len = text.length
   176       val matcher = regex_total.pattern.matcher(text)
   177       val result = new StringBuilder(len)
   178       var i = 0
   179       while (i < len) {
   180         val c = text(i)
   181         if (min <= c && c <= max) {
   182           matcher.region(i, len).lookingAt
   183           val x = matcher.group
   184           result.append(table.get(x) getOrElse x)
   185           i = matcher.end
   186         }
   187         else { result.append(c); i += 1 }
   188       }
   189       result.toString
   190     }
   191   }
   192 
   193 
   194 
   195   /** Symbol interpretation **/
   196 
   197   class Interpretation(symbol_decls: List[String])
   198   {
   199     /* read symbols */
   200 
   201     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   202     private val key = new Regex("""(?xs) (.+): """)
   203 
   204     private def read_decl(decl: String): (String, Map[String, String]) =
   205     {
   206       def err() = error("Bad symbol declaration: " + decl)
   207 
   208       def read_props(props: List[String]): Map[String, String] =
   209       {
   210         props match {
   211           case Nil => Map()
   212           case _ :: Nil => err()
   213           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   214           case _ => err()
   215         }
   216       }
   217       decl.split("\\s+").toList match {
   218         case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
   219         case _ => err()
   220       }
   221     }
   222 
   223     private val symbols: List[(String, Map[String, String])] =
   224       Map((
   225         for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches)
   226           yield read_decl(decl)): _*) toList
   227 
   228 
   229     /* misc properties */
   230 
   231     val names: Map[String, String] =
   232     {
   233       val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
   234       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   235     }
   236 
   237     val abbrevs: Map[String, String] =
   238       Map((
   239         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   240           yield (sym -> props("abbrev"))): _*)
   241 
   242 
   243     /* main recoder methods */
   244 
   245     private val (decoder, encoder) =
   246     {
   247       val mapping =
   248         for {
   249           (sym, props) <- symbols
   250           val code =
   251             try { Integer.decode(props("code")).intValue }
   252             catch {
   253               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   254               case _: NumberFormatException => error("Bad code for symbol " + sym)
   255             }
   256           val ch = new String(Character.toChars(code))
   257         } yield {
   258           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   259           else (sym, ch)
   260         }
   261       (new Recoder(mapping),
   262        new Recoder(mapping map { case (x, y) => (y, x) }))
   263     }
   264 
   265     def decode(text: String): String = decoder.recode(text)
   266     def encode(text: String): String = encoder.recode(text)
   267 
   268 
   269     /* classification */
   270 
   271     private object Decode_Set
   272     {
   273       def apply(elems: String*): Set[String] =
   274       {
   275         val content = elems.toList
   276         Set((content ::: content.map(decode)): _*)
   277       }
   278     }
   279 
   280     private val letters = Decode_Set(
   281       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   282       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   283       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   284       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   285 
   286       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   287       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   288       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   289       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   290       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   291       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   292       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   293       "\\<x>", "\\<y>", "\\<z>",
   294 
   295       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   296       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   297       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   298       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   299       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   300       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   301       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   302       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   303       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   304 
   305       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   306       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   307       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   308       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   309       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   310       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   311       "\\<Psi>", "\\<Omega>",
   312 
   313       "\\<^isub>", "\\<^isup>")
   314 
   315     private val blanks =
   316       Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   317 
   318     private val sym_chars =
   319       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   320 
   321     def is_letter(sym: String): Boolean = letters.contains(sym)
   322     def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   323     def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
   324     def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   325     def is_blank(sym: String): Boolean = blanks.contains(sym)
   326     def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
   327     def is_symbolic(sym: String): Boolean =
   328       sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   329     def is_controllable(sym: String): Boolean =
   330       !is_blank(sym) && !sym.startsWith("\\<^") && !is_malformed(sym)
   331   }
   332 }