wenzelm@27901: /* Title: Pure/General/symbol.scala wenzelm@27901: Author: Makarius wenzelm@27901: wenzelm@27924: Detecting and recoding Isabelle symbols. wenzelm@27901: */ wenzelm@27901: wenzelm@27901: package isabelle wenzelm@27901: wenzelm@27918: import scala.io.Source wenzelm@36035: import scala.collection.mutable wenzelm@31537: import scala.util.matching.Regex wenzelm@27901: wenzelm@27901: wenzelm@31537: object Symbol wenzelm@31537: { wenzelm@44570: type Symbol = String wenzelm@44570: wenzelm@44570: wenzelm@36772: /* spaces */ wenzelm@36772: wenzelm@36850: val spc = ' ' wenzelm@36850: val space = " " wenzelm@36850: wenzelm@36850: private val static_spaces = space * 4000 wenzelm@36772: wenzelm@36772: def spaces(k: Int): String = wenzelm@36772: { wenzelm@36772: require(k >= 0) wenzelm@36772: if (k < static_spaces.length) static_spaces.substring(0, k) wenzelm@36850: else space * k wenzelm@36772: } wenzelm@36772: wenzelm@36772: wenzelm@44292: /* ASCII characters */ wenzelm@44292: wenzelm@44292: def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' wenzelm@44292: def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9' wenzelm@44292: def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\'' wenzelm@44292: wenzelm@44292: def is_ascii_letdig(c: Char): Boolean = wenzelm@44292: is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c) wenzelm@44292: wenzelm@44292: def is_ascii_identifier(s: String): Boolean = wenzelm@44292: s.length > 0 && is_ascii_letter(s(0)) && s.substring(1).forall(is_ascii_letdig) wenzelm@44292: wenzelm@44292: wenzelm@34007: /* Symbol regexps */ wenzelm@27901: wenzelm@31537: private val plain = new Regex("""(?xs) wenzelm@40770: [^\r\\\ud800-\udfff\ufffd] | [\ud800-\udbff][\udc00-\udfff] """) wenzelm@37563: wenzelm@40768: private val physical_newline = new Regex("""(?xs) \n | \r\n | \r """) wenzelm@27901: wenzelm@31537: private val symbol = new Regex("""(?xs) wenzelm@31548: \\ < (?: wenzelm@27924: \^? [A-Za-z][A-Za-z0-9_']* | wenzelm@27924: \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""") wenzelm@27923: wenzelm@40769: private val malformed_symbol = new Regex("(?xs) (?!" + symbol + ")" + wenzelm@40775: """ [\ud800-\udbff\ufffd] | \\<\^? """) wenzelm@27923: wenzelm@40769: val regex_total = wenzelm@40769: new Regex(plain + "|" + physical_newline + "|" + symbol + "|" + malformed_symbol + "| .") wenzelm@27918: wenzelm@34146: wenzelm@34146: /* basic matching */ wenzelm@34146: wenzelm@37563: def is_plain(c: Char): Boolean = !(c == '\r' || c == '\\' || '\ud800' <= c && c <= '\udfff') wenzelm@34146: wenzelm@44570: def is_physical_newline(s: Symbol): Boolean = wenzelm@44550: s == "\n" || s == "\r" || s == "\r\n" wenzelm@39203: wenzelm@44570: def is_malformed(s: Symbol): Boolean = wenzelm@44550: !(s.length == 1 && is_plain(s(0))) && malformed_symbol.pattern.matcher(s).matches wenzelm@34146: wenzelm@34146: class Matcher(text: CharSequence) wenzelm@34146: { wenzelm@40769: private val matcher = regex_total.pattern.matcher(text) wenzelm@34146: def apply(start: Int, end: Int): Int = wenzelm@34146: { wenzelm@34146: require(0 <= start && start < end && end <= text.length) wenzelm@34319: if (is_plain(text.charAt(start))) 1 wenzelm@34147: else { wenzelm@34146: matcher.region(start, end).lookingAt wenzelm@34146: matcher.group.length wenzelm@34146: } wenzelm@34146: } wenzelm@31537: } wenzelm@27901: wenzelm@27901: wenzelm@44569: /* iterator */ wenzelm@31939: wenzelm@44570: private val char_symbols: Array[Symbol] = wenzelm@44550: (0 until 256).iterator.map(i => new String(Array(i.toChar))).toArray wenzelm@44550: wenzelm@44570: def iterator(text: CharSequence): Iterator[Symbol] = wenzelm@44570: new Iterator[Symbol] wenzelm@40768: { wenzelm@44360: private val matcher = new Matcher(text) wenzelm@44360: private var i = 0 wenzelm@44360: def hasNext = i < text.length wenzelm@44360: def next = wenzelm@44360: { wenzelm@44360: val n = matcher(i, text.length) wenzelm@44550: val s = wenzelm@44550: if (n == 0) "" wenzelm@44550: else if (n == 1) { wenzelm@44550: val c = text.charAt(i) wenzelm@44550: if (c < char_symbols.length) char_symbols(c) wenzelm@44550: else text.subSequence(i, i + n).toString wenzelm@44550: } wenzelm@44550: else text.subSequence(i, i + n).toString wenzelm@44360: i += n wenzelm@44360: s wenzelm@44360: } wenzelm@34007: } wenzelm@44360: wenzelm@34007: wenzelm@34007: /* decoding offsets */ wenzelm@34007: wenzelm@34007: class Index(text: CharSequence) wenzelm@31939: { wenzelm@44596: sealed case class Entry(chr: Int, sym: Int) wenzelm@31939: val index: Array[Entry] = wenzelm@31939: { wenzelm@34146: val matcher = new Matcher(text) wenzelm@31939: val buf = new mutable.ArrayBuffer[Entry] wenzelm@31939: var chr = 0 wenzelm@31939: var sym = 0 wenzelm@34007: while (chr < text.length) { wenzelm@34146: val n = matcher(chr, text.length) wenzelm@34146: chr += n wenzelm@31939: sym += 1 wenzelm@34146: if (n > 1) buf += Entry(chr, sym) wenzelm@31939: } wenzelm@31939: buf.toArray wenzelm@31939: } wenzelm@38797: def decode(sym1: Int): Int = wenzelm@31939: { wenzelm@38797: val sym = sym1 - 1 wenzelm@31939: val end = index.length wenzelm@31939: def bisect(a: Int, b: Int): Int = wenzelm@31939: { wenzelm@31939: if (a < b) { wenzelm@31939: val c = (a + b) / 2 wenzelm@31939: if (sym < index(c).sym) bisect(a, c) wenzelm@31939: else if (c + 1 == end || sym < index(c + 1).sym) c wenzelm@31939: else bisect(c + 1, b) wenzelm@31939: } wenzelm@31939: else -1 wenzelm@31939: } wenzelm@31939: val i = bisect(0, end) wenzelm@31939: if (i < 0) sym wenzelm@31939: else index(i).chr + sym - index(i).sym wenzelm@31939: } wenzelm@38797: def decode(range: Text.Range): Text.Range = range.map(decode(_)) wenzelm@31939: } wenzelm@31939: wenzelm@31939: wenzelm@34007: /* recoding text */ wenzelm@27924: wenzelm@31537: private class Recoder(list: List[(String, String)]) wenzelm@31537: { wenzelm@31537: private val (min, max) = wenzelm@31537: { wenzelm@27937: var min = '\uffff' wenzelm@27937: var max = '\u0000' wenzelm@27937: for ((x, _) <- list) { wenzelm@27937: val c = x(0) wenzelm@27937: if (c < min) min = c wenzelm@27937: if (c > max) max = c wenzelm@27937: } wenzelm@27937: (min, max) wenzelm@27937: } wenzelm@40690: private val table = wenzelm@40690: { wenzelm@40690: var tab = Map[String, String]() wenzelm@40690: for ((x, y) <- list) { wenzelm@40690: tab.get(x) match { wenzelm@40690: case None => tab += (x -> y) wenzelm@40690: case Some(z) => wenzelm@40690: error("Duplicate mapping of \"" + x + "\" to \"" + y + "\" vs. \"" + z + "\"") wenzelm@40690: } wenzelm@40690: } wenzelm@40690: tab wenzelm@40690: } wenzelm@31537: def recode(text: String): String = wenzelm@31537: { wenzelm@27937: val len = text.length wenzelm@40769: val matcher = regex_total.pattern.matcher(text) wenzelm@27937: val result = new StringBuilder(len) wenzelm@27937: var i = 0 wenzelm@27937: while (i < len) { wenzelm@27937: val c = text(i) wenzelm@27937: if (min <= c && c <= max) { wenzelm@31939: matcher.region(i, len).lookingAt wenzelm@27938: val x = matcher.group wenzelm@31537: result.append(table.get(x) getOrElse x) wenzelm@27937: i = matcher.end wenzelm@27937: } wenzelm@27937: else { result.append(c); i += 1 } wenzelm@27937: } wenzelm@27937: result.toString wenzelm@27924: } wenzelm@27924: } wenzelm@27918: wenzelm@27918: wenzelm@27937: wenzelm@44569: /** symbol interpretation **/ wenzelm@27937: wenzelm@44569: private lazy val symbols = wenzelm@44569: new Interpretation( wenzelm@44569: Isabelle_System.try_read(Path.split(Isabelle_System.getenv_strict("ISABELLE_SYMBOLS")))) wenzelm@44569: wenzelm@44569: private class Interpretation(symbols_spec: String) wenzelm@29569: { wenzelm@31537: /* read symbols */ wenzelm@31537: wenzelm@31537: private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """) wenzelm@31537: private val key = new Regex("""(?xs) (.+): """) wenzelm@31537: wenzelm@44570: private def read_decl(decl: String): (Symbol, Map[String, String]) = wenzelm@31537: { wenzelm@31537: def err() = error("Bad symbol declaration: " + decl) wenzelm@31537: wenzelm@31537: def read_props(props: List[String]): Map[String, String] = wenzelm@31537: { wenzelm@31537: props match { wenzelm@31537: case Nil => Map() wenzelm@31537: case _ :: Nil => err() wenzelm@31537: case key(x) :: y :: rest => read_props(rest) + (x -> y) wenzelm@31537: case _ => err() wenzelm@31537: } wenzelm@31537: } wenzelm@31537: decl.split("\\s+").toList match { wenzelm@40769: case sym :: props if sym.length > 1 && !is_malformed(sym) => (sym, read_props(props)) wenzelm@34193: case _ => err() wenzelm@31537: } wenzelm@31537: } wenzelm@31537: wenzelm@44570: private val symbols: List[(Symbol, Map[String, String])] = wenzelm@40690: Map(( wenzelm@44569: for (decl <- split_lines(symbols_spec) if !empty.pattern.matcher(decl).matches) wenzelm@40690: yield read_decl(decl)): _*) toList wenzelm@31537: wenzelm@31537: wenzelm@31651: /* misc properties */ wenzelm@31651: wenzelm@44570: val names: Map[Symbol, String] = wenzelm@34143: { wenzelm@44331: val name = new Regex("""\\<\^?([A-Za-z][A-Za-z0-9_']*)>""") wenzelm@31651: Map((for ((sym @ name(a), _) <- symbols) yield (sym -> a)): _*) wenzelm@31651: } wenzelm@31651: wenzelm@44570: val abbrevs: Map[Symbol, String] = wenzelm@44359: Map(( wenzelm@44359: for ((sym, props) <- symbols if props.isDefinedAt("abbrev")) wenzelm@44359: yield (sym -> props("abbrev"))): _*) wenzelm@44359: wenzelm@44359: wenzelm@44361: /* recoding */ wenzelm@31537: wenzelm@31537: private val (decoder, encoder) = wenzelm@31537: { wenzelm@31537: val mapping = wenzelm@31537: for { wenzelm@31537: (sym, props) <- symbols wenzelm@31537: val code = wenzelm@31537: try { Integer.decode(props("code")).intValue } wenzelm@31537: catch { wenzelm@31537: case _: NoSuchElementException => error("Missing code for symbol " + sym) wenzelm@31537: case _: NumberFormatException => error("Bad code for symbol " + sym) wenzelm@31537: } wenzelm@31537: val ch = new String(Character.toChars(code)) wenzelm@34193: } yield { wenzelm@34193: if (code < 128) error("Illegal ASCII code for symbol " + sym) wenzelm@34193: else (sym, ch) wenzelm@34193: } wenzelm@31548: (new Recoder(mapping), wenzelm@31551: new Recoder(mapping map { case (x, y) => (y, x) })) wenzelm@31537: } wenzelm@27918: wenzelm@34104: def decode(text: String): String = decoder.recode(text) wenzelm@34104: def encode(text: String): String = encoder.recode(text) wenzelm@34143: wenzelm@44361: private def recode_set(elems: String*): Set[String] = wenzelm@44361: { wenzelm@44361: val content = elems.toList wenzelm@44361: Set((content ::: content.map(decode)): _*) wenzelm@44361: } wenzelm@44361: wenzelm@44361: private def recode_map[A](elems: (String, A)*): Map[String, A] = wenzelm@44361: { wenzelm@44361: val content = elems.toList wenzelm@44361: Map((content ::: content.map({ case (sym, a) => (decode(sym), a) })): _*) wenzelm@44361: } wenzelm@44361: wenzelm@44361: wenzelm@44361: /* user fonts */ wenzelm@44361: wenzelm@44570: val fonts: Map[Symbol, String] = wenzelm@44361: recode_map(( wenzelm@44361: for ((sym, props) <- symbols if props.isDefinedAt("font")) wenzelm@44361: yield (sym -> props("font"))): _*) wenzelm@44361: wenzelm@44361: val font_names: List[String] = Set(fonts.toList.map(_._2): _*).toList wenzelm@44361: val font_index: Map[String, Int] = Map((font_names zip (0 until font_names.length).toList): _*) wenzelm@44361: wenzelm@34143: wenzelm@34143: /* classification */ wenzelm@34143: wenzelm@44569: val letters = recode_set( wenzelm@34143: "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", wenzelm@34143: "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", wenzelm@34143: "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", wenzelm@34143: "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", wenzelm@34143: wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\

", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", wenzelm@34143: wenzelm@34143: "\\", "\\", "\\", "\\

", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\
", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", wenzelm@34143: wenzelm@34143: "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", wenzelm@34143: wenzelm@34143: "\\<^isub>", "\\<^isup>") wenzelm@34143: wenzelm@44569: val blanks = wenzelm@44361: recode_set(space, "\t", "\n", "\u000B", "\f", "\r", "\\", "\\<^newline>") wenzelm@34147: wenzelm@44569: val sym_chars = wenzelm@34147: Set("!", "#", "$", "%", "&", "*", "+", "-", "/", "<", "=", ">", "?", "@", "^", "_", "|", "~") wenzelm@34143: wenzelm@44330: wenzelm@44359: /* control symbols */ wenzelm@44359: wenzelm@44570: val ctrl_decoded: Set[Symbol] = wenzelm@44359: Set((for ((sym, _) <- symbols if sym.startsWith("\\<^")) yield decode(sym)): _*) wenzelm@44359: wenzelm@44569: val subscript_decoded = Set(decode("\\<^sub>"), decode("\\<^isub>")) wenzelm@44569: val superscript_decoded = Set(decode("\\<^sup>"), decode("\\<^isup>")) wenzelm@44382: wenzelm@44382: val bold_decoded = decode("\\<^bold>") wenzelm@44382: val bsub_decoded = decode("\\<^bsub>") wenzelm@44382: val esub_decoded = decode("\\<^esub>") wenzelm@44382: val bsup_decoded = decode("\\<^bsup>") wenzelm@44382: val esup_decoded = decode("\\<^esup>") wenzelm@27918: } wenzelm@44569: wenzelm@44569: wenzelm@44569: /* tables */ wenzelm@44569: wenzelm@44570: def names: Map[Symbol, String] = symbols.names wenzelm@44570: def abbrevs: Map[Symbol, String] = symbols.abbrevs wenzelm@44569: wenzelm@44569: def decode(text: String): String = symbols.decode(text) wenzelm@44569: def encode(text: String): String = symbols.encode(text) wenzelm@44569: wenzelm@44570: def fonts: Map[Symbol, String] = symbols.fonts wenzelm@44569: def font_names: List[String] = symbols.font_names wenzelm@44569: def font_index: Map[String, Int] = symbols.font_index wenzelm@44570: def lookup_font(sym: Symbol): Option[Int] = symbols.fonts.get(sym).map(font_index(_)) wenzelm@44569: wenzelm@44569: wenzelm@44569: /* classification */ wenzelm@44569: wenzelm@44570: def is_letter(sym: Symbol): Boolean = symbols.letters.contains(sym) wenzelm@44570: def is_digit(sym: Symbol): Boolean = sym.length == 1 && '0' <= sym(0) && sym(0) <= '9' wenzelm@44570: def is_quasi(sym: Symbol): Boolean = sym == "_" || sym == "'" wenzelm@44570: def is_letdig(sym: Symbol): Boolean = is_letter(sym) || is_digit(sym) || is_quasi(sym) wenzelm@44570: def is_blank(sym: Symbol): Boolean = symbols.blanks.contains(sym) wenzelm@44570: def is_symbolic_char(sym: Symbol): Boolean = symbols.sym_chars.contains(sym) wenzelm@44570: def is_symbolic(sym: Symbol): Boolean = wenzelm@44569: sym.startsWith("\\<") && sym.endsWith(">") && !sym.startsWith("\\<^") wenzelm@44569: wenzelm@44569: wenzelm@44569: /* control symbols */ wenzelm@44569: wenzelm@44570: def is_ctrl(sym: Symbol): Boolean = wenzelm@44569: sym.startsWith("\\<^") || symbols.ctrl_decoded.contains(sym) wenzelm@44569: wenzelm@44570: def is_controllable(sym: Symbol): Boolean = wenzelm@44569: !is_blank(sym) && !is_ctrl(sym) && !is_malformed(sym) wenzelm@44569: wenzelm@44570: def is_subscript_decoded(sym: Symbol): Boolean = symbols.subscript_decoded.contains(sym) wenzelm@44570: def is_superscript_decoded(sym: Symbol): Boolean = symbols.superscript_decoded.contains(sym) wenzelm@44570: def is_bold_decoded(sym: Symbol): Boolean = sym == symbols.bold_decoded wenzelm@44570: def is_bsub_decoded(sym: Symbol): Boolean = sym == symbols.bsub_decoded wenzelm@44570: def is_esub_decoded(sym: Symbol): Boolean = sym == symbols.esub_decoded wenzelm@44570: def is_bsup_decoded(sym: Symbol): Boolean = sym == symbols.bsup_decoded wenzelm@44570: def is_esup_decoded(sym: Symbol): Boolean = sym == symbols.esup_decoded wenzelm@27901: }

", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", "\\", wenzelm@34143: "\\", "\\", "\\", "\\", "\\", "\\", "\\