src/Pure/General/symbol.scala
author wenzelm
Sun, 06 Dec 2009 23:06:53 +0100
changeset 34010 6e5eafb373b3
parent 34007 fc56cfc6906e
child 34104 2b9cdf23c188
permissions -rw-r--r--
elements: more convenient result;
     1 /*  Title:      Pure/General/symbol.scala
     2     Author:     Makarius
     3 
     4 Detecting and recoding Isabelle symbols.
     5 */
     6 
     7 package isabelle
     8 
     9 import scala.io.Source
    10 import scala.collection.{jcl, mutable}
    11 import scala.util.matching.Regex
    12 
    13 
    14 object Symbol
    15 {
    16   /* Symbol regexps */
    17 
    18   private val plain = new Regex("""(?xs)
    19     [^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)
    20 
    21   private val symbol = new Regex("""(?xs)
    22       \\ < (?:
    23       \^? [A-Za-z][A-Za-z0-9_']* |
    24       \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")
    25 
    26   private val bad_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    27     """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")
    28 
    29   // total pattern
    30   val regex = new Regex(plain + "|" + symbol + "|" + bad_symbol + "| .")
    31 
    32   // prefix of another symbol
    33   def is_open(s: CharSequence): Boolean =
    34   {
    35     val len = s.length
    36     len == 1 && Character.isHighSurrogate(s.charAt(0)) ||
    37     s == "\\" ||
    38     s == "\\<" ||
    39     len > 2 && s.charAt(len - 1) != '>'
    40   }
    41 
    42 
    43   /* elements */
    44 
    45   private def could_open(c: Char): Boolean =
    46     c == '\\' || Character.isHighSurrogate(c)
    47 
    48   def elements(text: CharSequence) = new Iterator[String] {
    49     private val matcher = regex.pattern.matcher(text)
    50     private var i = 0
    51     def hasNext = i < text.length
    52     def next = {
    53       val len =
    54         if (could_open(text.charAt(i))) {
    55           matcher.region(i, text.length).lookingAt
    56           matcher.group.length
    57         }
    58         else 1
    59       val s = text.subSequence(i, i + len)
    60       i += len
    61       s.toString
    62     }
    63   }
    64 
    65 
    66   /* decoding offsets */
    67 
    68   class Index(text: CharSequence)
    69   {
    70     case class Entry(chr: Int, sym: Int)
    71     val index: Array[Entry] =
    72     {
    73       val matcher = regex.pattern.matcher(text)
    74       val buf = new mutable.ArrayBuffer[Entry]
    75       var chr = 0
    76       var sym = 0
    77       while (chr < text.length) {
    78         val len =
    79           if (could_open(text.charAt(chr))) {
    80             matcher.region(chr, text.length).lookingAt
    81             matcher.group.length
    82           }
    83           else 1
    84         chr += len
    85         sym += 1
    86         if (len > 1) buf += Entry(chr, sym)
    87       }
    88       buf.toArray
    89     }
    90     def decode(sym: Int): Int =
    91     {
    92       val end = index.length
    93       def bisect(a: Int, b: Int): Int =
    94       {
    95         if (a < b) {
    96           val c = (a + b) / 2
    97           if (sym < index(c).sym) bisect(a, c)
    98           else if (c + 1 == end || sym < index(c + 1).sym) c
    99           else bisect(c + 1, b)
   100         }
   101         else -1
   102       }
   103       val i = bisect(0, end)
   104       if (i < 0) sym
   105       else index(i).chr + sym - index(i).sym
   106     }
   107   }
   108 
   109 
   110   /* recoding text */
   111 
   112   private class Recoder(list: List[(String, String)])
   113   {
   114     private val (min, max) =
   115     {
   116       var min = '\uffff'
   117       var max = '\u0000'
   118       for ((x, _) <- list) {
   119         val c = x(0)
   120         if (c < min) min = c
   121         if (c > max) max = c
   122       }
   123       (min, max)
   124     }
   125     private val table =
   126     {
   127       val table = new jcl.HashMap[String, String]   // reasonably efficient?
   128       for ((x, y) <- list) table + (x -> y)
   129       table
   130     }
   131     def recode(text: String): String =
   132     {
   133       val len = text.length
   134       val matcher = regex.pattern.matcher(text)
   135       val result = new StringBuilder(len)
   136       var i = 0
   137       while (i < len) {
   138         val c = text(i)
   139         if (min <= c && c <= max) {
   140           matcher.region(i, len).lookingAt
   141           val x = matcher.group
   142           result.append(table.get(x) getOrElse x)
   143           i = matcher.end
   144         }
   145         else { result.append(c); i += 1 }
   146       }
   147       result.toString
   148     }
   149   }
   150 
   151 
   152 
   153   /** Symbol interpretation **/
   154 
   155   class Interpretation(symbol_decls: Iterator[String])
   156   {
   157     /* read symbols */
   158 
   159     private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
   160     private val key = new Regex("""(?xs) (.+): """)
   161 
   162     private def read_decl(decl: String): (String, Map[String, String]) =
   163     {
   164       def err() = error("Bad symbol declaration: " + decl)
   165 
   166       def read_props(props: List[String]): Map[String, String] =
   167       {
   168         props match {
   169           case Nil => Map()
   170           case _ :: Nil => err()
   171           case key(x) :: y :: rest => read_props(rest) + (x -> y)
   172           case _ => err()
   173         }
   174       }
   175       decl.split("\\s+").toList match {
   176         case Nil => err()
   177         case sym :: props => (sym, read_props(props))
   178       }
   179     }
   180 
   181     private val symbols: List[(String, Map[String, String])] =
   182       for (decl <- symbol_decls.toList if !empty.pattern.matcher(decl).matches)
   183         yield read_decl(decl)
   184 
   185 
   186     /* misc properties */
   187 
   188     val names: Map[String, String] = {
   189       val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
   190       Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
   191     }
   192 
   193     val abbrevs: Map[String, String] = Map((
   194       for ((sym, props) <- symbols if props.isDefinedAt("abbrev"))
   195         yield (sym -> props("abbrev"))): _*)
   196 
   197 
   198     /* main recoder methods */
   199 
   200     private val (decoder, encoder) =
   201     {
   202       val mapping =
   203         for {
   204           (sym, props) <- symbols
   205           val code =
   206             try { Integer.decode(props("code")).intValue }
   207             catch {
   208               case _: NoSuchElementException => error("Missing code for symbol " + sym)
   209               case _: NumberFormatException => error("Bad code for symbol " + sym)
   210             }
   211           val ch = new String(Character.toChars(code))
   212         } yield (sym, ch)
   213       (new Recoder(mapping),
   214        new Recoder(mapping map { case (x, y) => (y, x) }))
   215     }
   216 
   217     def decode(text: String) = decoder.recode(text)
   218     def encode(text: String) = encoder.recode(text)
   219   }
   220 }