src/Pure/General/symbol.scala
author wenzelm
Sat, 19 Dec 2009 11:45:14 +0100
changeset 34143 d8d9df8407f6
parent 34104 2b9cdf23c188
child 34146 6cc9a0cbaf55
permissions -rw-r--r--
added symbol classification;
tuned;
wenzelm@27901
     1
/*  Title:      Pure/General/symbol.scala
wenzelm@27901
     2
    Author:     Makarius
wenzelm@27901
     3
wenzelm@27924
     4
Detecting and recoding Isabelle symbols.
wenzelm@27901
     5
*/
wenzelm@27901
     6
wenzelm@27901
     7
package isabelle
wenzelm@27901
     8
wenzelm@27918
     9
import scala.io.Source
wenzelm@31939
    10
import scala.collection.{jcl, mutable}
wenzelm@31537
    11
import scala.util.matching.Regex
wenzelm@27901
    12
wenzelm@27901
    13
wenzelm@31537
    14
object Symbol
wenzelm@31537
    15
{
wenzelm@34007
    16
  /* Symbol regexps */
wenzelm@27901
    17
wenzelm@31537
    18
  // Any single character other than backslash and lone surrogates, or a
  // well-formed surrogate pair (high surrogate followed by low surrogate).
  private val plain: Regex = new Regex("""(?xs)
    [^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """)

  // Well-formed symbol: \<ident>, \<^ident> or \<^raw:...>.
  private val symbol: Regex = new Regex("""(?xs)
      \\ < (?:
      \^? [A-Za-z][A-Za-z0-9_']* |
      \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""")

  // Text starting with \< that is not a well-formed symbol; it extends up to
  // (not including) whitespace, quote, backquote, backslash or a comment /
  // verbatim delimiter.
  private val bad_symbol: Regex =
    new Regex(
      "(?xs) (?!" + symbol.toString + ")" +
      """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""")

  // total pattern: alternatives are tried in order, the final " ." accepts
  // any remaining single character
  val regex: Regex =
    new Regex(List(plain.toString, symbol.toString, bad_symbol.toString, " .").mkString("|"))
wenzelm@27918
    31
wenzelm@31537
    32
  // prefix of another symbol: a lone high surrogate, a lone backslash, the
  // opening "\<", or any text longer than 2 characters that does not end
  // with '>'
  def is_open(s: CharSequence): Boolean =
  {
    val len = s.length
    // CharSequence does not specify content equality, so compare the String
    // form; otherwise e.g. a StringBuilder holding "\\<" is never recognized
    val str = s.toString
    len == 1 && Character.isHighSurrogate(s.charAt(0)) ||
    str == "\\" ||
    str == "\\<" ||
    len > 2 && s.charAt(len - 1) != '>'   // FIXME bad_symbol !??
  }
wenzelm@27901
    41
wenzelm@27901
    42
wenzelm@34007
    43
  /* elements */
wenzelm@31939
    44
wenzelm@34143
    45
  // Only a backslash or a high surrogate can start an element that is longer
  // than one character.
  def could_open(c: Char): Boolean =
    Character.isHighSurrogate(c) || c == '\\'
wenzelm@34007
    47
wenzelm@34143
    48
  // Iterate over the elements of text in order, as matched by the total
  // pattern: each result is one plain character, surrogate pair, symbol or
  // malformed symbol.
  def elements(text: CharSequence): Iterator[String] = new Iterator[String]
  {
    private val matcher = regex.pattern.matcher(text)
    private var i = 0   // offset of the next element within text

    def hasNext: Boolean = i < text.length

    def next(): String =
    {
      // honour the Iterator contract instead of failing inside charAt
      if (!hasNext) throw new NoSuchElementException("next on empty iterator")
      val len =
        if (could_open(text.charAt(i))) {
          // the total pattern accepts any single character, so a match at a
          // valid offset is always present and the result can be ignored
          matcher.region(i, text.length).lookingAt
          matcher.group.length
        }
        else 1   // fast path: ordinary single character
      val s = text.subSequence(i, i + len)
      i += len
      s.toString
    }
  }
wenzelm@34007
    65
wenzelm@34007
    66
wenzelm@34007
    67
  /* decoding offsets */
wenzelm@34007
    68
wenzelm@34007
    69
  // Translation of symbol offsets into character offsets for a fixed text.
  class Index(text: CharSequence)
  {
    // Checkpoint taken right after an element spanning more than one
    // character: character offset chr corresponds to symbol offset sym.
    case class Entry(chr: Int, sym: Int)

    // Checkpoints in text order; elements of length 1 need no entry, since
    // both offsets advance by one for them.
    val index: Array[Entry] =
    {
      val matcher = regex.pattern.matcher(text)
      val entries = new mutable.ArrayBuffer[Entry]
      var char_pos = 0
      var sym_count = 0
      while (char_pos < text.length) {
        val start = char_pos
        if (could_open(text.charAt(start))) {
          matcher.region(start, text.length).lookingAt
          char_pos += matcher.group.length
        }
        else char_pos += 1
        sym_count += 1
        if (char_pos - start > 1) entries += Entry(char_pos, sym_count)
      }
      entries.toArray
    }

    // Character offset for the given symbol offset: binary search for the
    // last checkpoint whose symbol offset is <= sym, then add the remaining
    // distance; without such a checkpoint both offsets coincide.
    def decode(sym: Int): Int =
    {
      val end = index.length
      var lo = 0
      var hi = end
      var found = -1
      while (found < 0 && lo < hi) {
        val mid = (lo + hi) / 2
        if (sym < index(mid).sym) hi = mid
        else if (mid + 1 == end || sym < index(mid + 1).sym) found = mid
        else lo = mid + 1
      }
      if (found < 0) sym
      else {
        val entry = index(found)
        entry.chr + sym - entry.sym
      }
    }
  }
wenzelm@31939
   109
wenzelm@31939
   110
wenzelm@34007
   111
  /* recoding text */
wenzelm@27924
   112
wenzelm@31537
   113
  // Replaces every element of a text that occurs as first component in list
  // by the corresponding second component; other elements are kept as is.
  private class Recoder(list: List[(String, String)])
  {
    // Range of first characters over all source strings: characters outside
    // this range cannot start a replaceable element.  Empty source strings
    // can never match an element and are skipped (they have no first
    // character).  For an empty list the range is empty.
    private val (min, max) =
    {
      var min = '\uffff'
      var max = '\u0000'
      for ((x, _) <- list if x.length > 0) {
        val c = x.charAt(0)
        if (c < min) min = c
        if (c > max) max = c
      }
      (min, max)
    }

    // Immutable lookup table; for duplicate source strings the last entry of
    // list wins.  Built directly from the pairs instead of relying on a
    // mutating "+" of a mutable map.
    private val table: Map[String, String] = Map(list: _*)

    def recode(text: String): String =
    {
      val len = text.length
      val matcher = regex.pattern.matcher(text)
      val result = new StringBuilder(len)
      var i = 0
      while (i < len) {
        val c = text.charAt(i)
        if (min <= c && c <= max) {
          // candidate position: take the full element and translate it if
          // the table has an entry for it
          matcher.region(i, len).lookingAt
          val x = matcher.group
          result.append(table.getOrElse(x, x))
          i = matcher.end
        }
        else { result.append(c); i += 1 }
      }
      result.toString
    }
  }
wenzelm@27918
   151
wenzelm@27918
   152
wenzelm@27937
   153
wenzelm@27937
   154
  /** Symbol interpretation **/
wenzelm@27937
   155
wenzelm@29569
   156
  // Interpretation of symbols according to a sequence of declaration lines,
  // each of the form "symbol key1: value1 key2: value2 ...".
  class Interpretation(symbol_decls: Iterator[String])
  {
    /* read symbols */

    // blank line, or line consisting of a #-comment only
    private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
    // property key "name:", the group is the name without the colon
    private val key = new Regex("""(?xs) (.+): """)

    // Split one declaration into the symbol and its property map.
    private def read_decl(decl: String): (String, Map[String, String]) =
    {
      def err() = error("Bad symbol declaration: " + decl)

      // properties come as alternating "key:" / value words; if a key is
      // repeated, its first occurrence wins (it is added last)
      def read_props(props: List[String]): Map[String, String] =
      {
        props match {
          case Nil => Map()
          case _ :: Nil => err()
          case key(x) :: y :: rest => read_props(rest) + (x -> y)
          case _ => err()
        }
      }
      decl.split("\\s+").toList match {
        case Nil => err()
        case sym :: props => (sym, read_props(props))
      }
    }

    private val symbols: List[(String, Map[String, String])] =
      for (decl <- symbol_decls.toList if !empty.pattern.matcher(decl).matches)
        yield read_decl(decl)


    /* misc properties */

    // plain name of each declared symbol of the form \<name>
    val names: Map[String, String] =
    {
      val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
      Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*)
    }

    // "abbrev" property of each symbol that declares one
    val abbrevs: Map[String, String] = Map((
      for ((sym, props) <- symbols if props.contains("abbrev"))
        yield (sym -> props("abbrev"))): _*)


    /* main recoder methods */

    private val (decoder, encoder) =
    {
      // String for the code point given by the "code" property of sym.
      def code_string(sym: String, props: Map[String, String]): String =
        try { new String(Character.toChars(Integer.decode(props("code")).intValue)) }
        catch {
          case _: NoSuchElementException => error("Missing code for symbol " + sym)
          case _: NumberFormatException => error("Bad code for symbol " + sym)
          // Character.toChars rejects numbers that are not valid code points
          case _: IllegalArgumentException => error("Bad code for symbol " + sym)
        }
      val mapping = symbols.map { case (sym, props) => (sym, code_string(sym, props)) }
      (new Recoder(mapping),
       new Recoder(mapping map { case (x, y) => (y, x) }))
    }

    // symbol notation ~> declared code point characters
    def decode(text: String): String = decoder.recode(text)
    // declared code point characters ~> symbol notation
    def encode(text: String): String = encoder.recode(text)


    /* classification */

    private val raw_letters = Set(
      "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
      "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
      "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
      "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",

      "\\<A>", "\\<B>", "\\<C>", "\\<D>", "\\<E>", "\\<F>", "\\<G>",
      "\\<H>", "\\<I>", "\\<J>", "\\<K>", "\\<L>", "\\<M>", "\\<N>",
      "\\<O>", "\\<P>", "\\<Q>", "\\<R>", "\\<S>", "\\<T>", "\\<U>",
      "\\<V>", "\\<W>", "\\<X>", "\\<Y>", "\\<Z>", "\\<a>", "\\<b>",
      "\\<c>", "\\<d>", "\\<e>", "\\<f>", "\\<g>", "\\<h>", "\\<i>",
      "\\<j>", "\\<k>", "\\<l>", "\\<m>", "\\<n>", "\\<o>", "\\<p>",
      "\\<q>", "\\<r>", "\\<s>", "\\<t>", "\\<u>", "\\<v>", "\\<w>",
      "\\<x>", "\\<y>", "\\<z>",

      "\\<AA>", "\\<BB>", "\\<CC>", "\\<DD>", "\\<EE>", "\\<FF>",
      "\\<GG>", "\\<HH>", "\\<II>", "\\<JJ>", "\\<KK>", "\\<LL>",
      "\\<MM>", "\\<NN>", "\\<OO>", "\\<PP>", "\\<QQ>", "\\<RR>",
      "\\<SS>", "\\<TT>", "\\<UU>", "\\<VV>", "\\<WW>", "\\<XX>",
      "\\<YY>", "\\<ZZ>", "\\<aa>", "\\<bb>", "\\<cc>", "\\<dd>",
      "\\<ee>", "\\<ff>", "\\<gg>", "\\<hh>", "\\<ii>", "\\<jj>",
      "\\<kk>", "\\<ll>", "\\<mm>", "\\<nn>", "\\<oo>", "\\<pp>",
      "\\<qq>", "\\<rr>", "\\<ss>", "\\<tt>", "\\<uu>", "\\<vv>",
      "\\<ww>", "\\<xx>", "\\<yy>", "\\<zz>",

      "\\<alpha>", "\\<beta>", "\\<gamma>", "\\<delta>", "\\<epsilon>",
      "\\<zeta>", "\\<eta>", "\\<theta>", "\\<iota>", "\\<kappa>",
      "\\<mu>", "\\<nu>", "\\<xi>", "\\<pi>", "\\<rho>", "\\<sigma>",
      "\\<tau>", "\\<upsilon>", "\\<phi>", "\\<chi>", "\\<psi>",
      "\\<omega>", "\\<Gamma>", "\\<Delta>", "\\<Theta>", "\\<Lambda>",
      "\\<Xi>", "\\<Pi>", "\\<Sigma>", "\\<Upsilon>", "\\<Phi>",
      "\\<Psi>", "\\<Omega>",

      "\\<^isub>", "\\<^isup>")

    // letters in symbol notation as well as in decoded form
    private val letters = raw_letters ++ raw_letters.map(decode(_))

    def is_letter(sym: String): Boolean = letters.contains(sym)

    def is_digit(sym: String): Boolean =
      sym.length == 1 && '0' <= sym.charAt(0) && sym.charAt(0) <= '9'

    def is_quasi(sym: String): Boolean = sym == "_" || sym == "'"


    private val raw_blanks =
      Set(" ", "\t", "\n", "\u000B", "\f", "\r", "\\<spacespace>", "\\<^newline>")
    // blanks in symbol notation as well as in decoded form
    private val blanks = raw_blanks ++ raw_blanks.map(decode(_))

    def is_blank(sym: String): Boolean = blanks.contains(sym)
  }
wenzelm@27901
   280
}