src/Pure/General/symbol.scala
author wenzelm
Sat, 13 Nov 2010 19:21:53 +0100
changeset 40769 1050315f6ee2
parent 40768 37b79d789d91
child 40770 6131d7a78ad3
permissions -rw-r--r--
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
allow malformed symbols inside quoted material, comments etc. -- for improved user experience with incremental re-parsing;
refined treatment of malformed surrogates (Scala);
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.mutable
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   /* spaces */
    17 
    18   val spc = ' '
    19   val space = " "
    20 
    21   private val static_spaces = space * 4000
    22 
    23   def spaces(k: Int): String =
    24   {
    25     require(k >= 0)
    26     if (k < static_spaces.length) static_spaces.substring(0, k)
    27     else space * k
    28   }
    29 
    30 
    31   /* Symbol regexps */
    32 
    33   private val plain = new Regex("""(?xs)
    34       [^\r\\\ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)
    35 
    36   private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """)
    37 
    38   private val symbol = new Regex("""(?xs)
    39       \\ < (?:
    40       \^? [A-Za-z][A-Za-z0-9_']* |
    41       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    42 
    43   private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    44     """ [\ud800-\udbff] | \\<^? """)
    45 
    46   val regex_total =
    47     new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .")
    48 
    49 
    50   /* basic matching */
    51 
    52   def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff')
    53 
    54   def is_physical_newline(s: CharSequence): Boolean =
    55     "\n".contentEquals(s) || "\r".contentEquals(s) || "\r\n".contentEquals(s)
    56 
    57   def is_malformed(s: CharSequence): Boolean =
    58     !(s.length == 1 && is_plain(s.charAt(0))) && malformed_symbol.pattern.matcher(s).matches
    59 
    60   class Matcher(text: CharSequence)
    61   {
    62     private val matcher = regex_total.pattern.matcher(text)
    63     def apply(start: Int, end: Int): Int =
    64     {
    65       require(0 <= start && start < end && end <= text.length)
    66       if (is_plain(text.charAt(start))) 1
    67       else {
    68         matcher.region(start, end).lookingAt
    69         matcher.group.length
    70       }
    71     }
    72   }
    73 
    74 
    75   /* iterator */
    76 
    77   def iterator(text: CharSequence) = new Iterator[CharSequence]
    78   {
    79     private val matcher = new Matcher(text)
    80     private var i = 0
    81     def hasNext = i < text.length
    82     def next =
    83     {
    84       val n = matcher(i, text.length)
    85       val s = text.subSequence(i, i + n)
    86       i += n
    87       s
    88     }
    89   }
    90 
    91 
    92   /* decoding offsets */
    93 
    94   class Index(text: CharSequence)
    95   {
    96     case class Entry(chr: Int, sym: Int)
    97     val index: Array[Entry] =
    98     {
    99       val matcher = new Matcher(text)
   100       val buf = new mutable.ArrayBuffer[Entry]
   101       var chr = 0
   102       var sym = 0
   103       while (chr < text.length) {
   104         val n = matcher(chr, text.length)
   105         chr += n
   106         sym += 1
   107         if (n > 1) buf += Entry(chr, sym)
   108       }
   109       buf.toArray
   110     }
   111     def decode(sym1: Int): Int =
   112     {
   113       val sym = sym1 - 1
   114       val end = index.length
   115       def bisect(a: Int, b: Int): Int =
   116       {
   117         if (a < b) {
   118           val c = (a + b) / 2
   119           if (sym < index(c).sym) bisect(a, c)
   120           else if (c + 1 == end || sym < index(c + 1).sym) c
   121           else bisect(c + 1, b)
   122         }
   123         else -1
   124       }
   125       val i = bisect(0, end)
   126       if (i < 0) sym
   127       else index(i).chr + sym - index(i).sym
   128     }
   129     def decode(range: Text.Range): Text.Range = range.map(decode(_))
   130   }
   131 
   132 
   133   /* recoding text */
   134 
   135   private class Recoder(list: List[(String, String)])
   136   {
   137     private val (min, max) =
   138     {
   139       var min = '\uffff'
   140       var max = '\u0000'
   141       for ((x, _) <- list) {
   142         val c = x(0)
   143         if (c < min) min = c
   144         if (c > max) max = c
   145       }
   146       (min, max)
   147     }
   148     private val table =
   149     {
   150       var tab = Map[String, String]()
   151       for ((x, y) <- list) {
   152         tab.get(x) match {
   153           case None => tab += (x -> y)
   154           case Some(z) =>
   155             error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
   156         }
   157       }
   158       tab
   159     }
   160     def recode(text: String): String =
   161     {
   162       val len = text.length
   163       val matcher = regex_total.pattern.matcher(text)
   164       val result = new StringBuilder(len)
   165       var i = 0
   166       while (i < len) {
   167         val c = text(i)
   168         if (min <= c && c <= max) {
   169           matcher.region(i, len).lookingAt
   170           val x = matcher.group
   171           result.append(table.get(x) getOrElse x)
   172           i = matcher.end
   173         }
   174         else { result.append(c); i += 1 }
   175       }
   176       result.toString
   177     }
   178   }
   179 
   180 
   181 
   182   /** Symbol interpretation **/
   183 
   184   class Interpretation(symbol_decls: List[String])
   185   {
   186     /* read symbols */
   187 
   188     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   189     private val key = new Regex("""(?xs) (.+): """)
   190 
   191     private def read_decl(decl: String): (String, Map[String, String]) =
   192     {
   193       def err() = error("Bad symbol declaration: " + decl)
   194 
   195       def read_props(props: List[String]): Map[String, String] =
   196       {
   197         props match {
   198           case Nil => Map()
   199           case _ :: Nil => err()
   200           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   201           case _ => err()
   202         }
   203       }
   204       decl.split("\\s+").toList match {
   205         case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props))
   206         case _ => err()
   207       }
   208     }
   209 
   210     private val symbols: List[(String, Map[String, String])] =
   211       Map((
   212         for (decl <- symbol_decls if !empty.pattern.matcher(decl).matches)
   213           yield read_decl(decl)): _*) toList
   214 
   215 
   216     /* misc properties */
   217 
   218     val names: Map[String, String] =
   219     {
   220       val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
   221       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   222     }
   223 
   224     val abbrevs: Map[String, String] =
   225       Map((
   226         for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   227           yield (sym -> props("abbrev"))): _*)
   228 
   229 
   230     /* main recoder methods */
   231 
   232     private val (decoder, encoder) =
   233     {
   234       val mapping =
   235         for {
   236           (sym, props) <- symbols
   237           val code =
   238             try { Integer.decode(props("code")).intValue }
   239             catch {
   240               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   241               case _: NumberFormatException => error("Bad code for symbol " + sym)
   242             }
   243           val ch = new String(Character.toChars(code))
   244         } yield {
   245           if (code < 128) error("Illegal ASCII code for symbol " + sym)
   246           else (sym, ch)
   247         }
   248       (new Recoder(mapping),
   249        new Recoder(mapping map { case (x, y) => (y, x) }))
   250     }
   251 
   252     def decode(text: String): String = decoder.recode(text)
   253     def encode(text: String): String = encoder.recode(text)
   254 
   255 
   256     /* classification */
   257 
   258     private object Decode_Set
   259     {
   260       def apply(elems: String*): Set[String] =
   261       {
   262         val content = elems.toList
   263         Set((content ::: content.map(decode)): _*)
   264       }
   265     }
   266 
   267     private val letters = Decode_Set(
   268       "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
   269       "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
   270       "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
   271       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
   272 
   273       "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
   274       "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
   275       "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
   276       "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
   277       "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
   278       "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
   279       "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
   280       "\\<x>", "\\<y>", "\\<z>",
   281 
   282       "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
   283       "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
   284       "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
   285       "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
   286       "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
   287       "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
   288       "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
   289       "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
   290       "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",
   291 
   292       "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
   293       "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
   294       "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
   295       "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
   296       "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
   297       "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
   298       "\\<Psi>", "\\<Omega>",
   299 
   300       "\\<^isub>", "\\<^isup>")
   301 
   302     private val blanks =
   303       Decode_Set(space, "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
   304 
   305     private val sym_chars =
   306       Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~")
   307 
   308     def is_letter(sym: String): Boolean = letters.contains(sym)
   309     def is_digit(sym: String): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9'
   310     def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"
   311     def is_letdig(sym: String): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym)
   312     def is_blank(sym: String): Boolean = blanks.contains(sym)
   313     def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
   314     def is_symbolic(sym: String): Boolean =
   315       sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^")
   316   }
   317 }