src/Pure/General/symbol.scala
author wenzelm
Sat, 13 Nov 2010 19:27:41 +0100
changeset 40770 6131d7a78ad3
parent 40769 1050315f6ee2
child 40775 d5fb1f1a5857
permissions -rw-r--r--
treat Unicode "replacement character" (i.e. decoding error) as malformed;
wenzelm@27901
     1
/*  Title:      Pure/General/symbol.scala
wenzelm@27901
     2
    Author:     Makarius
wenzelm@27901
     3
wenzelm@27924
     4
Detecting and recoding Isabelle symbols.
wenzelm@27901
     5
*/
wenzelm@27901
     6
wenzelm@27901
     7
package isabelle
wenzelm@27901
     8
wenzelm@27918
     9
import scala.io.Source
wenzelm@36035
    10
import scala.collection.mutable
wenzelm@31537
    11
import scala.util.matching.Regex
wenzelm@27901
    12
wenzelm@27901
    13
wenzelm@31537
    14
object Symbol
wenzelm@31537
    15
{
wenzelm@36772
    16
  /* spaces */
wenzelm@36772
    17
wenzelm@36850
    18
  // A single blank, once as a character and once as a one-character string.
  val spc: Char = ' '
  val space: String = " "

  // Pre-built run of 4000 blanks.
  private val static_spaces: String = space * 4000
wenzelm@36772
    22
wenzelm@36772
    23
  // Produce a string of exactly k blanks; k must not be negative.
  def spaces(k: Int): String =
  {
    require(k >= 0)
    val cached = static_spaces.length
    // Requests shorter than the pre-built run are cut out of it,
    // anything else is built afresh.
    if (k >= cached) space * k
    else static_spaces.substring(0, k)
  }
wenzelm@36772
    29
wenzelm@36772
    30
wenzelm@34007
    31
  /* Symbol regexps */
wenzelm@27901
    32
wenzelm@31537
    33
  // One plain symbol: any single char other than CR, backslash, a surrogate
  // half or U+FFFD -- or else a complete high/low surrogate pair.
  private val plain: Regex =
    """(?xs)
      [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """.r
wenzelm@37563
    35
wenzelm@40768
    36
  // A physical line break: LF, CR LF, or a bare CR.
  private val physical_newline: Regex = """(?xs) \n | \r\n | \r """.r
wenzelm@27901
    37
wenzelm@31537
    38
  // A well-formed symbol in angle brackets after a backslash: either an
  // identifier with an optional leading caret, or "^raw:" followed by
  // printable text that contains neither '.' nor '>'.
  private val symbol: Regex =
    """(?xs)
      \\ < (?:
      \^? [A-Za-z][A-Za-z0-9_']* |
      \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""".r
wenzelm@27923
    42
wenzelm@40769
    43
  // Fragments that cannot be part of a proper symbol: an unpaired high
  // surrogate, the Unicode replacement character (the trace of a decoding
  // error), or a dangling symbol opener -- backslash, '<' and an optional '^'.
  // The caret is escaped so that it stands for a literal '^'; unescaped it
  // would be the start-of-input anchor and the opener with caret would never
  // be recognized as one unit.
  private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" +
    """ [\ud800-\udbff\ufffd] | \\<\^? """)
wenzelm@27923
    45
wenzelm@40769
    46
  // Total symbol regex: plain characters, line breaks, well-formed symbols and
  // malformed fragments are tried in this order, with any single character as
  // the final fallback.
  val regex_total: Regex =
  {
    val alternatives = List(plain, physical_newline, symbol, malformed_symbol).map(_.toString)
    new Regex(alternatives.mkString("", "|", "| ."))
  }
wenzelm@27918
    48
wenzelm@34146
    49
wenzelm@34146
    50
  /* basic matching */
wenzelm@34146
    51
wenzelm@37563
    52
  // A char is plain when it stands for itself as a one-char symbol: it is not
  // CR, not backslash, not a surrogate half and not the Unicode replacement
  // character U+FFFD.  This agrees with the single-character alternative of
  // the `plain` regex; excluding U+FFFD here is what allows is_malformed to
  // report the replacement character instead of short-circuiting on it.
  def is_plain(c: Char): Boolean =
    !(c == '\r' || c == '\\' || c == '\ufffd' || '\ud800' <= c && c <= '\udfff')
wenzelm@34146
    53
wenzelm@39203
    54
  // True if s is exactly one physical line break: LF, CR, or CR LF.
  def is_physical_newline(s: CharSequence): Boolean =
    List("\n", "\r", "\r\n").exists(_.contentEquals(s))
wenzelm@39203
    56
wenzelm@40769
    57
  // True if s as a whole matches the malformed-symbol regex; a single plain
  // character is rejected up front without consulting the regex.
  def is_malformed(s: CharSequence): Boolean =
  {
    val single_plain = s.length == 1 && is_plain(s.charAt(0))
    if (single_plain) false
    else malformed_symbol.pattern.matcher(s).matches
  }
wenzelm@34146
    59
wenzelm@34146
    60
  // Measures the length of the symbol found at a given position of text.
  class Matcher(text: CharSequence)
  {
    private val total = regex_total.pattern.matcher(text)

    // Number of chars taken by the symbol that starts at `start`, looking no
    // further than `end`; requires 0 <= start < end <= text.length.
    def apply(start: Int, end: Int): Int =
    {
      require(0 <= start && start < end && end <= text.length)
      val head = text.charAt(start)
      if (!is_plain(head)) {
        // only non-plain chars need the total regex
        total.region(start, end).lookingAt
        total.group.length
      }
      else 1
    }
  }
wenzelm@27901
    73
wenzelm@27901
    74
wenzelm@36035
    75
  /* iterator */
wenzelm@31939
    76
wenzelm@36035
    77
  // Walk through text symbol by symbol; each element is the sub-sequence
  // covered by one symbol, as measured by Matcher.
  def iterator(text: CharSequence) = new Iterator[CharSequence]
  {
    private val symbol_length = new Matcher(text)
    private var pos = 0
    def hasNext = pos < text.length
    def next =
    {
      val stop = pos + symbol_length(pos, text.length)
      val sym = text.subSequence(pos, stop)
      pos = stop
      sym
    }
  }
wenzelm@34007
    90
wenzelm@34007
    91
wenzelm@34007
    92
  /* decoding offsets */
wenzelm@34007
    93
wenzelm@34007
    94
  // Maps symbol offsets of text to char offsets.
  class Index(text: CharSequence)
  {
    // Char offset and symbol count reached right after a multi-char symbol.
    case class Entry(chr: Int, sym: Int)

    // One entry per symbol that occupies more than one char, in text order.
    val index: Array[Entry] =
    {
      val symbol_length = new Matcher(text)
      val entries = new mutable.ArrayBuffer[Entry]
      var chars = 0
      var syms = 0
      while (chars < text.length) {
        val n = symbol_length(chars, text.length)
        chars += n
        syms += 1
        if (n > 1) entries += Entry(chars, syms)
      }
      entries.toArray
    }

    // Char offset for symbol offset sym1.  The argument is shifted down by one
    // and looked up in the last entry whose symbol count does not exceed it;
    // without such an entry the shifted value is returned unchanged.
    def decode(sym1: Int): Int =
    {
      val sym = sym1 - 1
      val end = index.length

      // binary search for the last entry with entry.sym <= sym
      var lo = 0
      var hi = end
      var found = -1
      while (found < 0 && lo < hi) {
        val mid = (lo + hi) / 2
        if (sym < index(mid).sym) hi = mid
        else if (mid + 1 == end || sym < index(mid + 1).sym) found = mid
        else lo = mid + 1
      }

      if (found < 0) sym
      else {
        val entry = index(found)
        entry.chr + sym - entry.sym
      }
    }

    // Decode the offsets of a range via range.map.
    def decode(range: Text.Range): Text.Range = range.map(offset => decode(offset))
  }
wenzelm@31939
   131
wenzelm@31939
   132
wenzelm@34007
   133
  /* recoding text */
wenzelm@27924
   134
wenzelm@31537
   135
  // Replaces symbols of a text according to a list of (source, target) pairs.
  private class Recoder(list: List[(String, String)])
  {
    // Smallest and largest first char over all source strings; only text
    // positions whose char lies in this interval are inspected more closely.
    private val (min, max) =
      list.foldLeft(('\uffff', '\u0000')) { case ((lo, hi), (x, _)) =>
        val c = x(0)
        (if (c < lo) c else lo, if (c > hi) c else hi)
      }

    // Source-to-target table; a source that occurs twice is an error.
    private val table =
      list.foldLeft(Map[String, String]()) { case (tab, (x, y)) =>
        tab.get(x) match {
          case None => tab + (x -> y)
          case Some(z) =>
            error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"")
        }
      }

    // Recode text symbol by symbol; symbols without a table entry are kept.
    def recode(text: String): String =
    {
      val length = text.length
      val total = regex_total.pattern.matcher(text)
      val out = new StringBuilder(length)
      var pos = 0
      while (pos < length) {
        val c = text.charAt(pos)
        if (c < min || max < c) {
          // cannot start any source string: copy the char verbatim
          out.append(c)
          pos += 1
        }
        else {
          total.region(pos, length).lookingAt
          val sym = total.group
          out.append(table.getOrElse(sym, sym))
          pos = total.end
        }
      }
      out.toString
    }
  }
wenzelm@27918
   179
wenzelm@27918
   180
wenzelm@27937
   181
wenzelm@27937
   182
  /** Symbol interpretation **/
wenzelm@27937
   183
wenzelm@34146
   184
  class Interpretation(symbol_decls: List[String])
wenzelm@29569
   185
  {
wenzelm@31537
   186
    /* read symbols */
wenzelm@31537
   187
wenzelm@31537
   188
    // A declaration line that is blank or holds nothing but a '#' comment.
    private val empty: Regex = """(?xs) ^\s* (?: \#.* )? $ """.r

    // A property key: some text followed by a colon; the group is the text.
    private val key: Regex = """(?xs) (.+): """.r
wenzelm@31537
   190
wenzelm@31537
   191
    // Parse one declaration line: a symbol followed by "key: value" word pairs.
    // The symbol must be longer than one char and must not be malformed; any
    // violation is reported as a bad declaration.
    private def read_decl(decl: String): (String, Map[String, String]) =
    {
      def err() = error("Bad symbol declaration: " + decl)

      // Words are consumed pairwise; entries nearer the front are added last
      // and therefore win over later entries with the same key.
      def read_props(props: List[String]): Map[String, String] =
        props match {
          case Nil => Map()
          case key(name) :: value :: more => read_props(more) + (name -> value)
          case _ => err()
        }

      val words = decl.split("\\s+").toList
      words match {
        case sym :: props if sym.length > 1 && !is_malformed(sym) =>
          (sym, read_props(props))
        case _ => err()
      }
    }
wenzelm@31537
   209
wenzelm@31537
   210
    // All declared symbols with their properties.  Blank and comment lines are
    // skipped; passing through a Map keeps one entry per symbol.
    private val symbols: List[(String, Map[String, String])] =
    {
      val decls =
        symbol_decls.collect {
          case decl if !empty.pattern.matcher(decl).matches => read_decl(decl)
        }
      Map(decls: _*).toList
    }
wenzelm@31537
   214
wenzelm@31537
   215
wenzelm@31651
   216
    /* misc properties */
wenzelm@31651
   217
wenzelm@34143
   218
    // Maps every symbol written as backslash, '<', identifier, '>' to that
    // identifier; symbols of any other shape are left out.
    val names: Map[String, String] =
    {
      val plain_name = """\\<([A-Za-z][A-Za-z0-9_']*)>""".r
      val pairs = symbols.collect { case (sym @ plain_name(a), _) => sym -> a }
      Map(pairs: _*)
    }
wenzelm@31651
   223
wenzelm@40690
   224
    // Maps every symbol that declares an "abbrev" property to its value.
    val abbrevs: Map[String, String] =
    {
      val pairs =
        symbols.collect {
          case (sym, props) if props.isDefinedAt("abbrev") => sym -> props("abbrev")
        }
      Map(pairs: _*)
    }
wenzelm@31651
   228
wenzelm@31651
   229
wenzelm@31537
   230
    /* main recoder methods */
wenzelm@31537
   231
wenzelm@31537
   232
    // Recoders between symbols and the characters given by their "code"
    // property: the decoder maps symbol to character, the encoder the reverse.
    // The three stages run one after another over the whole list, so a
    // missing or unparsable code anywhere is reported before any later check.
    private val (decoder, encoder) =
    {
      // stage 1: numeric code of every symbol
      val codes =
        symbols.map { case (sym, props) =>
          val code =
            try { Integer.decode(props("code")).intValue }
            catch {
              case _: NoSuchElementException => error("Missing code for symbol " + sym)
              case _: NumberFormatException => error("Bad code for symbol " + sym)
            }
          (sym, code)
        }

      // stage 2: the code point as a string
      val chars =
        codes.map { case (sym, code) => (sym, code, new String(Character.toChars(code))) }

      // stage 3: codes below 128 are rejected
      val mapping =
        chars.map { case (sym, code, ch) =>
          if (code < 128) error("Illegal ASCII code for symbol " + sym)
          else (sym, ch)
        }

      val inverse = mapping.map { case (sym, ch) => (ch, sym) }
      (new Recoder(mapping), new Recoder(inverse))
    }
wenzelm@27918
   251
wenzelm@34104
   252
    // Recode text with the decoder, i.e. the Recoder built from the
    // (symbol, character) pairs of the symbol declarations.
    def decode(text: String): String = decoder.recode(text)
wenzelm@34104
   253
    // Recode text with the encoder, i.e. the Recoder built from the swapped
    // (character, symbol) pairs of the symbol declarations.
    def encode(text: String): String = encoder.recode(text)
wenzelm@34143
   254
wenzelm@34143
   255
wenzelm@34143
   256
    /* classification */
wenzelm@34143
   257
wenzelm@34147
   258
    // Builds a set that holds every given element together with its decoded form.
    private object Decode_Set
    {
      def apply(elems: String*): Set[String] =
      {
        val encoded = elems.toList
        val decoded = encoded.map(decode)
        Set((encoded ::: decoded): _*)
      }
    }
wenzelm@34147
   266
wenzelm@34147
   267
    // Everything classified as a letter: the ASCII letters, their bracketed
    // single and doubled forms, the listed Greek names (lowercase lambda is
    // not among them), and the two symbols "^isub" and "^isup".
    private val letters =
    {
      def bracket(name: String): String = "\\<" + name + ">"

      val latin =
        ('A' to 'Z').map(_.toString).toList ::: ('a' to 'z').map(_.toString).toList
      val single = latin.map(bracket)
      val doubled = latin.map(a => bracket(a + a))
      val greek =
        List(
          "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
          "iota", "kappa", "mu", "nu", "xi", "pi", "rho", "sigma", "tau",
          "upsilon", "phi", "chi", "psi", "omega",
          "Gamma", "Delta", "Theta", "Lambda", "Xi", "Pi", "Sigma", "Upsilon",
          "Phi", "Psi", "Omega").map(bracket)
      val subscripts = List("^isub", "^isup").map(bracket)

      Decode_Set((latin ::: single ::: doubled ::: greek ::: subscripts): _*)
    }
wenzelm@34143
   301
wenzelm@34147
   302
    // Everything classified as blank: ASCII white space plus two symbols,
    // each together with its decoded form.
    private val blanks: Set[String] =
      Decode_Set(
        space, "\t", "\n", "\u000B", "\f", "\r",
        "\\<spacespace>", "\\<^newline>")
wenzelm@34147
   304
wenzelm@34147
   305
    // The ASCII characters classified as symbolic, each as a one-char string.
    private val sym_chars: Set[String] =
      Set("!#$%&*+-/<=>?@^_|~".toList.map(_.toString): _*)
wenzelm@34143
   307
wenzelm@34143
   308
    // True if sym is a member of the letters set.
    def is_letter(sym: String): Boolean = letters.contains(sym)
wenzelm@34147
   309
    // True if sym is exactly one ASCII digit '0' .. '9'.
    def is_digit(sym: String): Boolean =
      sym.length == 1 && {
        val c = sym.charAt(0)
        '0' <= c && c <= '9'
      }
wenzelm@34143
   310
    // True if sym is the underscore or the single quote.
    def is_quasi(sym: String): Boolean =
      sym match {
        case "_" | "'" => true
        case _ => false
      }
wenzelm@34147
   311
    // True if sym is a letter, a digit, or one of the quasi-letters.
    def is_letdig(sym: String): Boolean =
      if (is_letter(sym)) true
      else is_digit(sym) || is_quasi(sym)
wenzelm@34143
   312
    // True if sym is a member of the blanks set.
    def is_blank(sym: String): Boolean = blanks.contains(sym)
wenzelm@34147
   313
    // True if sym is a member of the sym_chars set.
    def is_symbolic_char(sym: String): Boolean = sym_chars.contains(sym)
wenzelm@40769
   314
    // True if sym opens with backslash and '<', closes with '>', and does not
    // have a caret right after the opener.
    def is_symbolic(sym: String): Boolean =
    {
      val bracketed = sym.startsWith("\\<") && sym.endsWith(">")
      bracketed && !sym.startsWith("\\<^")
    }
wenzelm@27918
   316
  }
wenzelm@27901
   317
}