wenzelm@27901
|
1 |
/* Title: Pure/General/symbol.scala
|
wenzelm@27901
|
2 |
Author: Makarius
|
wenzelm@27901
|
3 |
|
wenzelm@27924
|
4 |
Detecting and recoding Isabelle symbols.
|
wenzelm@27901
|
5 |
*/
|
wenzelm@27901
|
6 |
|
wenzelm@27901
|
7 |
package isabelle
|
wenzelm@27901
|
8 |
|
wenzelm@27918
|
9 |
import scala.io.Source
|
wenzelm@31939
|
10 |
import scala.collection.{jcl, mutable}
|
wenzelm@31537
|
11 |
import scala.util.matching.Regex
|
wenzelm@27901
|
12 |
|
wenzelm@27901
|
13 |
|
wenzelm@31537
|
14 |
object Symbol
|
wenzelm@31537
|
15 |
{
|
wenzelm@34007
|
16 |
/* Symbol regexps */
|
wenzelm@27901
|
17 |
|
wenzelm@31537
|
18 |
// one ordinary character (neither a backslash nor a surrogate),
// or a high surrogate followed by a low surrogate
private val plain: Regex =
  """(?xs)
    [^\\ \ud800-\udfff] | [\ud800-\udbff][\udc00-\udfff] """.r

// well-formed symbol: backslash, '<', then either an identifier with
// optional '^' prefix or a "^raw:" payload, then '>'
private val symbol: Regex =
  """(?xs)
    \\ < (?:
    \^? [A-Za-z][A-Za-z0-9_']* |
    \^raw: [\x20-\x7e\u0100-\uffff && [^.>]]* ) >""".r

// malformed symbol: starts like a symbol but is not one; extends up to
// white space, a quote or backslash, or a comment/verbatim delimiter
private val bad_symbol: Regex =
  ("(?xs) (?!" + symbol.toString + ")" +
    """ \\ < (?: (?! \s | [\"`\\] | \(\* | \*\) | \{\* | \*\} ) . )*""").r

// total pattern: the alternatives above in order, then any single character
val regex: Regex =
  (plain.toString + "|" + symbol.toString + "|" + bad_symbol.toString + "| .").r
|
wenzelm@27918
|
31 |
|
wenzelm@31537
|
32 |
/* Check whether s is an incomplete prefix of a longer symbol element:
   a lone high surrogate, a lone backslash, a backslash followed by '<',
   or any text longer than two characters that does not end in '>'.

   The contents are inspected character by character.  Comparing a
   CharSequence against a String literal with == is only true for actual
   String instances, so other implementations (e.g. StringBuilder) would
   never be recognized as an open backslash or symbol opener. */
def is_open(s: CharSequence): Boolean =
{
  val len = s.length
  len == 1 && (Character.isHighSurrogate(s.charAt(0)) || s.charAt(0) == '\\') ||
  len == 2 && s.charAt(0) == '\\' && s.charAt(1) == '<' ||
  len > 2 && s.charAt(len - 1) != '>'
}
|
wenzelm@27901
|
41 |
|
wenzelm@27901
|
42 |
|
wenzelm@34007
|
43 |
/* elements */
|
wenzelm@31939
|
44 |
|
wenzelm@34007
|
45 |
// true for characters that may start a multi-character element:
// a backslash or a high surrogate
private def could_open(c: Char): Boolean =
  c match {
    case '\\' => true
    case _ => Character.isHighSurrogate(c)
  }
|
wenzelm@34007
|
47 |
|
wenzelm@34010
|
48 |
/* Iterate over the elements of text.  A character that cannot open a
   longer element is returned on its own; otherwise the total regex is
   applied at the current position and its match is returned. */
def elements(text: CharSequence) = new Iterator[String] {
  private val matcher = regex.pattern.matcher(text)
  private var pos = 0

  def hasNext = pos < text.length

  def next = {
    val start = pos
    val stop =
      if (could_open(text.charAt(start))) {
        // anchor the match at the current position
        matcher.region(start, text.length).lookingAt
        start + matcher.group.length
      }
      else start + 1
    val element = text.subSequence(start, stop)
    pos = stop
    element.toString
  }
}
|
wenzelm@34007
|
64 |
|
wenzelm@34007
|
65 |
|
wenzelm@34007
|
66 |
/* decoding offsets */
|
wenzelm@34007
|
67 |
|
wenzelm@34007
|
68 |
/* Translation of symbol offsets into character offsets for a fixed text. */
class Index(text: CharSequence)
{
  // chr: character offset just after a multi-character element,
  // sym: number of elements up to and including that element
  case class Entry(chr: Int, sym: Int)

  // one entry per element that spans more than one character,
  // in order of occurrence
  val index: Array[Entry] =
  {
    val matcher = regex.pattern.matcher(text)
    val entries = new mutable.ArrayBuffer[Entry]
    var chr_pos = 0
    var sym_pos = 0
    while (chr_pos < text.length) {
      val start = chr_pos
      if (could_open(text.charAt(start))) {
        matcher.region(start, text.length).lookingAt
        chr_pos = start + matcher.group.length
      }
      else chr_pos = start + 1
      sym_pos += 1
      if (chr_pos - start > 1) entries += Entry(chr_pos, sym_pos)
    }
    entries.toArray
  }

  /* Convert the symbol offset sym into a character offset.  The last entry
     whose symbol count does not exceed sym is located by bisection; without
     such an entry both offsets coincide. */
  def decode(sym: Int): Int =
  {
    val end = index.length
    var lo = 0
    var hi = end
    var found = -1
    while (found < 0 && lo < hi) {
      val mid = (lo + hi) / 2
      if (sym < index(mid).sym) hi = mid
      else if (mid + 1 == end || sym < index(mid + 1).sym) found = mid
      else lo = mid + 1
    }
    if (found < 0) sym
    else {
      val entry = index(found)
      entry.chr + sym - entry.sym
    }
  }
}
|
wenzelm@31939
|
108 |
|
wenzelm@31939
|
109 |
|
wenzelm@34007
|
110 |
/* recoding text */
|
wenzelm@27924
|
111 |
|
wenzelm@31537
|
112 |
/* Table-driven replacement of text elements.  Each pair (x, y) of list maps
   the element x to its replacement y.  Every key must be non-empty, because
   its first character is inspected on construction. */
private class Recoder(list: List[(String, String)])
{
  // range of first characters over all keys: text characters outside of
  // this range are copied verbatim without consulting the regex
  private val (min, max) =
  {
    var min = '\uffff'
    var max = '\u0000'
    for ((x, _) <- list) {
      val c = x(0)
      if (c < min) min = c
      if (c > max) max = c
    }
    (min, max)
  }

  // Immutable lookup table; for duplicate keys the last pair wins.  Built
  // directly from the list instead of discarding the result of "+" on a
  // mutable map, which only works where "+" updates the map in place.
  private val table: Map[String, String] = Map(list: _*)

  /* Replace every element of text that has a table entry; all other
     elements are copied unchanged. */
  def recode(text: String): String =
  {
    val len = text.length
    val matcher = regex.pattern.matcher(text)
    val result = new StringBuilder(len)
    var i = 0
    while (i < len) {
      val c = text.charAt(i)
      if (min <= c && c <= max) {
        // candidate position: take the whole element found here
        matcher.region(i, len).lookingAt
        val x = matcher.group
        result.append(table.getOrElse(x, x))
        i = matcher.end
      }
      else { result.append(c); i += 1 }
    }
    result.toString
  }
}
|
wenzelm@27918
|
150 |
|
wenzelm@27918
|
151 |
|
wenzelm@27937
|
152 |
|
wenzelm@27937
|
153 |
/** Symbol interpretation **/
|
wenzelm@27937
|
154 |
|
wenzelm@29569
|
155 |
/* Interpretation of symbols according to the given declaration lines:
   symbol names, abbreviations, and recoding between symbols and the
   characters given by their "code" property. */
class Interpretation(symbol_decls: Iterator[String])
{
  /* read symbols */

  // blank line, possibly with a '#' comment
  private val empty = new Regex("""(?xs) ^\s* (?: \#.* )? $ """)
  // property key: non-empty text followed by ':'
  private val key = new Regex("""(?xs) (.+): """)

  // split a declaration into the symbol and its "key: value" properties
  private def read_decl(decl: String): (String, Map[String, String]) =
  {
    def err() = error("Bad symbol declaration: " + decl)

    // consume the words pairwise; a dangling word or a key without
    // trailing ':' is an error
    def read_props(props: List[String]): Map[String, String] =
      props match {
        case Nil => Map()
        case key(x) :: y :: rest => read_props(rest) + (x -> y)
        case _ => err()
      }

    val words = decl.split("\\s+").toList
    words match {
      case sym :: props => (sym, read_props(props))
      case Nil => err()
    }
  }

  // all declarations that are not blank or comment lines
  private val symbols: List[(String, Map[String, String])] =
    symbol_decls.toList
      .filter(decl => !empty.pattern.matcher(decl).matches)
      .map(decl => read_decl(decl))


  /* misc properties */

  // symbol -> bare name, for symbols of the plain identifier form
  val names: Map[String, String] =
  {
    val name = new Regex("""\\<([A-Za-z][A-Za-z0-9_']*)>""")
    val pairs =
      symbols flatMap {
        case (sym @ name(a), _) => List(sym -> a)
        case _ => Nil
      }
    Map(pairs: _*)
  }

  // symbol -> value of its "abbrev" property, where present
  val abbrevs: Map[String, String] =
  {
    val pairs =
      for ((sym, props) <- symbols if props.contains("abbrev"))
        yield (sym -> props("abbrev"))
    Map(pairs: _*)
  }


  /* main recoder methods */

  private val (decoder, encoder) =
  {
    // first pass: numeric code of every symbol
    val codes =
      symbols map { case (sym, props) =>
        val code =
          try { Integer.decode(props("code")).intValue }
          catch {
            case _: NoSuchElementException => error("Missing code for symbol " + sym)
            case _: NumberFormatException => error("Bad code for symbol " + sym)
          }
        (sym, code)
      }
    // second pass: the code point as a string
    val mapping =
      codes map { case (sym, code) => (sym, new String(Character.toChars(code))) }
    val inverse = mapping map { pair => (pair._2, pair._1) }
    (new Recoder(mapping), new Recoder(inverse))
  }

  // replace symbols by the characters of their codes
  def decode(text: String): String = decoder.recode(text)

  // replace such characters by the corresponding symbols
  def encode(text: String): String = encoder.recode(text)
}
|
wenzelm@27901
|
220 |
}
|