src/Pure/General/scan.scala
author wenzelm
Thu, 07 Jul 2011 13:48:30 +0200
changeset 44569 5130dfe1b7be
parent 44294 a26e514c92b2
child 44570 58bb7ca5c7a2
permissions -rw-r--r--
simplified Symbol based on lazy Symbol.Interpretation -- reduced odd "functorial style";
tuned implicit build/init messages;
     1 /*  Title:      Pure/General/scan.scala
     2     Author:     Makarius
     3 
     4 Efficient scanning of keywords and tokens.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.generic.Addable
    11 import scala.collection.IndexedSeq
    12 import scala.collection.immutable.PagedSeq
    13 import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
    14 import scala.util.parsing.combinator.RegexParsers
    15 
    16 import java.io.{File, InputStream, BufferedInputStream, FileInputStream}
    17 
    18 
    19 object Scan
    20 {
    21   /** context of partial scans **/
    22 
  /* Scanner context for partial scans: records where a previous scan stopped
     so that the next chunk of input can resume inside an open quotation,
     verbatim text, or (nested) comment. */
  sealed abstract class Context
  case object Finished extends Context                // no token is currently open
  case class Quoted(quote: String) extends Context    // inside a quotation opened by `quote`
  case object Verbatim extends Context                // inside a {* ... *} verbatim text
  case class Comment(depth: Int) extends Context      // inside (* ... *), nested `depth` levels
    28 
    29 
    30 
    31   /** Lexicon -- position tree **/
    32 
    33   object Lexicon
    34   {
    35     protected case class Tree(val branches: Map[Char, (String, Tree)])
    36     private val empty_tree = Tree(Map())
    37 
    38     val empty: Lexicon = new Lexicon
    39     def apply(elems: String*): Lexicon = empty ++ elems
    40   }
    41 
    42   class Lexicon extends Addable[String, Lexicon] with RegexParsers
    43   {
    44     /* representation */
    45 
    46     import Lexicon.Tree
    47     protected val main_tree: Tree = Lexicon.empty_tree
    48 
    49 
    50     /* auxiliary operations */
    51 
    /* Collect all complete keywords stored in `tree`, prepending them to
       `result` (order unspecified -- callers sort when needed). */
    private def content(tree: Tree, result: List[String]): List[String] =
      tree.branches.foldLeft(result) {
        case (res, (_, (s, tr))) =>
          if (s.isEmpty) content(tr, res) else content(tr, s :: res)
      }
    56 
    /* Follow `str` character by character through the tree: yields the node
       reached plus a flag telling whether `str` itself is a complete keyword,
       or None if `str` is not even a prefix of any keyword. */
    private def lookup(str: CharSequence): Option[(Boolean, Tree)] =
    {
      val len = str.length
      def descend(tree: Tree, tip: Boolean, i: Int): Option[(Boolean, Tree)] =
        if (i == len) Some((tip, tree))
        else
          tree.branches.get(str.charAt(i)) match {
            case None => None
            case Some((s, tr)) => descend(tr, !s.isEmpty, i + 1)
          }
      descend(main_tree, false, 0)
    }
    71 
    /* All keywords of the lexicon that extend the given prefix; includes
       `str` itself when it is already a complete keyword. */
    def completions(str: CharSequence): List[String] =
      lookup(str) match {
        case None => Nil
        case Some((tip, tree)) =>
          content(tree, if (tip) List(str.toString) else Nil)
      }
    78 
    79 
    /* pseudo Set methods */

    // all keywords in ascending lexicographic order
    def iterator: Iterator[String] = content(main_tree, Nil).sorted.iterator

    override def toString: String = iterator.mkString("Lexicon(", ", ", ")")

    def empty: Lexicon = Lexicon.empty
    def isEmpty: Boolean = main_tree.branches.isEmpty

    // membership: `elem` must lead to a node marked as a complete keyword
    def contains(elem: String): Boolean =
      lookup(elem).exists(_._1)
    94 
    95 
    /* Addable methods */

    // required by Addable: the collection itself
    def repr = this

    // Functional insertion: returns a new Lexicon extended by `elem`,
    // leaving this one unchanged (unaffected subtrees are shared).
    def + (elem: String): Lexicon =
      if (contains(elem)) this
      else {
        val len = elem.length
        // rebuild the spine of the tree along the characters of elem;
        // the node for the final character stores the complete word
        def extend(tree: Tree, i: Int): Tree =
          if (i < len) {
            val c = elem.charAt(i)
            val end = (i + 1 == len)
            tree.branches.get(c) match {
              // existing branch: keep its stored word unless elem ends here
              case Some((s, tr)) =>
                Tree(tree.branches +
                  (c -> (if (end) elem else s, extend(tr, i + 1))))
              // fresh branch: intermediate nodes carry no complete word
              case None =>
                Tree(tree.branches +
                  (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1))))
            }
          }
          else tree
        // capture `this` under a new name: inside the anonymous subclass
        // below, main_tree refers to the overriding definition
        val old = this
        new Lexicon { override val main_tree = extend(old.main_tree, 0) }
      }
   121 
   122 
   123 
   124     /** RegexParsers methods **/
   125 
   126     override val whiteSpace = "".r
   127 
   128 
   129     /* optional termination */
   130 
   131     def opt_term[T](p: => Parser[T]): Parser[Option[T]] =
   132       p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)
   133 
   134 
    /* keywords from lexicon */

    // Longest-match scan of the lexicon against the input: follows branches
    // of main_tree from the current offset, remembering the last complete
    // keyword passed, and consumes exactly that many characters on success.
    def keyword: Parser[String] = new Parser[String]
    {
      def apply(in: Input) =
      {
        val source = in.source
        val offset = in.offset
        val len = source.length - offset

        // result is the longest complete keyword found so far ("" if none)
        def scan(tree: Tree, result: String, i: Int): String =
        {
          if (i < len) {
            tree.branches.get(source.charAt(offset + i)) match {
              case Some((s, tr)) => scan(tr, if (s.isEmpty) result else s, i + 1)
              case None => result
            }
          }
          else result
        }
        val result = scan(main_tree, "", 0)
        if (result.isEmpty) Failure("keyword expected", in)
        else Success(result, in.drop(result.length))
      }
    }.named("keyword")
   160 
   161 
    /* symbol range */

    // Parse between min_count and max_count consecutive Isabelle symbols
    // satisfying pred; succeeds with the matched text verbatim.
    // NOTE(review): assumes Symbol.Matcher(i, end) yields the length of the
    // symbol starting at position i -- confirm against Symbol.scala.
    def symbol_range(pred: String => Boolean, min_count: Int, max_count: Int): Parser[String] =
      new Parser[String]
      {
        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length
          val matcher = new Symbol.Matcher(in.source)

          var i = start
          var count = 0
          var finished = false
          while (!finished) {
            if (i < end && count < max_count) {
              val n = matcher(i, end)
              val sym = in.source.subSequence(i, i + n).toString
              // accept the symbol only if pred holds, otherwise stop scanning
              if (pred(sym)) { i += n; count += 1 }
              else finished = true
            }
            else finished = true
          }
          if (count < min_count) Failure("bad input", in)
          else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
        }
      }.named("symbol_range")
   189 
   190     def one(pred: String => Boolean): Parser[String] = symbol_range(pred, 1, 1)
   191     def many(pred: String => Boolean): Parser[String] = symbol_range(pred, 0, Integer.MAX_VALUE)
   192     def many1(pred: String => Boolean): Parser[String] = symbol_range(pred, 1, Integer.MAX_VALUE)
   193 
   194 
    /* quoted strings */

    // Body of a quotation: runs of symbols other than the quote character
    // and backslash, or an escaped quote, an escaped backslash, or a 3-digit
    // decimal character escape \ddd whose value is at most 255.
    private def quoted_body(quote: String): Parser[String] =
    {
      rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
        (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
    }
   202 
   203     private def quoted(quote: String): Parser[String] =
   204     {
   205       quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
   206     }.named("quoted")
   207 
    // Recover the unescaped content of a well-formed quotation `source`
    // (which must parse with `quoted`): strips the surrounding quotes and,
    // only if a backslash occurs, re-parses the body to resolve \<quote>,
    // \\ and \ddd escapes; otherwise returns the body unchanged.
    def quoted_content(quote: String, source: String): String =
    {
      require(parseAll(quoted(quote), source).successful)
      val body = source.substring(1, source.length - 1)
      if (body.exists(_ == '\\')) {
        val content =
          rep(many1(sym => sym != quote && sym != "\\") |
              "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
        parseAll(content ^^ (_.mkString), body).get
      }
      else body
    }
   220 
    // Scan a (possibly unterminated) quotation relative to context `ctxt`:
    // either open a new quotation (Finished) or continue one already open on
    // the same quote character; yields the text consumed and the new context.
    def quoted_context(quote: String, ctxt: Context): Parser[(String, Context)] =
    {
      ctxt match {
        case Finished =>
          quote ~ quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)
              case x ~ y ~ None => (x + y, Quoted(quote)) }
        case Quoted(q) if q == quote =>
          quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, ctxt) }
        // other contexts (Verbatim, Comment, different quote): not applicable
        case _ => failure("")
      }
    }.named("quoted_context")
   235 
   236 
    /* verbatim text */

    // Body of verbatim text: anything except the "*}" terminator
    // ("*" itself is allowed when not followed by "}").
    private def verbatim_body: Parser[String] =
      rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)

    // Complete verbatim text including its {* *} delimiters.
    private def verbatim: Parser[String] =
    {
      "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
    }.named("verbatim")
   246 
   247     def verbatim_content(source: String): String =
   248     {
   249       require(parseAll(verbatim, source).successful)
   250       source.substring(2, source.length - 2)
   251     }
   252 
    // Scan (part of) verbatim text relative to context `ctxt`: either open
    // a new {* ... *} section (Finished) or continue one already open;
    // yields the text consumed and the new context.
    def verbatim_context(ctxt: Context): Parser[(String, Context)] =
    {
      ctxt match {
        case Finished =>
          "{*" ~ verbatim_body ~ opt_term("*}") ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)
              case x ~ y ~ None => (x + y, Verbatim) }
        case Verbatim =>
          verbatim_body ~ opt_term("*}") ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, Verbatim) }
        // quotation or comment contexts: not applicable here
        case _ => failure("")
      }
    }.named("verbatim_context")
   267 
   268 
    /* nested comments */

    // Scan (part of) a nested ML-style comment starting at nesting `depth`:
    // repeatedly consumes "(*" (increment), "*)" (decrement, only while some
    // level is open), or plain comment text, until no further progress is
    // possible.  Succeeds with the consumed text and the resulting depth;
    // fails if nothing at all was consumed.
    private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
    {
      require(depth >= 0)

      // chunks free of comment delimiters: "*" / "(" are allowed only when
      // they do not start "*)" / "(*"
      val comment_text =
        rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)

      def apply(in: Input) =
      {
        var rest = in
        // try p at the current position; on success advance `rest`
        def try_parse[A](p: Parser[A]): Boolean =
        {
          parse(p ^^^ (), rest) match {
            case Success(_, next) => { rest = next; true }
            case _ => false
          }
        }
        var d = depth
        var finished = false
        while (!finished) {
          if (try_parse("(*")) d += 1
          else if (d > 0 && try_parse("*)")) d -= 1
          // at depth 0 only an opening "(*" may start the comment
          else if (d == 0 || !try_parse(comment_text)) finished = true
        }
        if (in.offset < rest.offset)
          Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)
        else Failure("comment expected", in)
      }
    }.named("comment_depth")
   300 
   301     def comment: Parser[String] =
   302       comment_depth(0) ^? { case (x, d) if d == 0 => x }
   303 
   304     def comment_context(ctxt: Context): Parser[(String, Context)] =
   305     {
   306       val depth =
   307         ctxt match {
   308           case Finished => 0
   309           case Comment(d) => d
   310           case _ => -1
   311         }
   312       if (depth >= 0)
   313         comment_depth(depth) ^^
   314           { case (x, 0) => (x, Finished)
   315             case (x, d) => (x, Comment(d)) }
   316       else failure("")
   317     }
   318 
   319     def comment_content(source: String): String =
   320     {
   321       require(parseAll(comment, source).successful)
   322       source.substring(2, source.length - 2)
   323     }
   324 
   325 
   326     /* outer syntax tokens */
   327 
   328     private def delimited_token: Parser[Token] =
   329     {
   330       val string = quoted("\"") ^^ (x => Token(Token.Kind.STRING, x))
   331       val alt_string = quoted("`") ^^ (x => Token(Token.Kind.ALT_STRING, x))
   332       val verb = verbatim ^^ (x => Token(Token.Kind.VERBATIM, x))
   333       val cmt = comment ^^ (x => Token(Token.Kind.COMMENT, x))
   334 
   335       string | (alt_string | (verb | cmt))
   336     }
   337 
    // Tokens without explicit delimiters: identifiers, variables, numerals,
    // symbolic identifiers, keywords/commands (classified via is_command),
    // white space -- with "unparsed" junk recovery for malformed input.
    private def other_token(is_command: String => Boolean)
      : Parser[Token] =
    {
      // basic syntactic categories
      val id = one(Symbol.is_letter) ~ many(Symbol.is_letdig) ^^ { case x ~ y => x + y }
      val nat = many1(Symbol.is_digit)
      val natdot = nat ~ "." ~ nat ^^ { case x ~ y ~ z => x + y + z }
      val id_nat = id ~ opt("." ~ nat) ^^ { case x ~ Some(y ~ z) => x + y + z case x ~ None => x }

      // plain vs. long (dot-qualified) identifiers
      val ident = id ~ rep("." ~> id) ^^
        { case x ~ Nil => Token(Token.Kind.IDENT, x)
          case x ~ ys => Token(Token.Kind.LONG_IDENT, (x :: ys).mkString(".")) }

      val var_ = "?" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.VAR, x + y) }
      val type_ident = "'" ~ id ^^ { case x ~ y => Token(Token.Kind.TYPE_IDENT, x + y) }
      val type_var = "?'" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.TYPE_VAR, x + y) }
      val nat_ = nat ^^ (x => Token(Token.Kind.NAT, x))
      val float =
        ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))

      val sym_ident =
        (many1(Symbol.is_symbolic_char) | one(sym => Symbol.is_symbolic(sym))) ^^
        (x => Token(Token.Kind.SYM_IDENT, x))

      val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))

      // error recovery: swallow non-blank input as a single UNPARSED token
      // FIXME check
      val junk = many(sym => !(Symbol.is_blank(sym)))
      val junk1 = many1(sym => !(Symbol.is_blank(sym)))

      // an opening delimiter whose token failed to parse as delimited_token
      // consumes the rest of the non-blank input as UNPARSED
      val bad_delimiter =
        ("\"" | "`" | "{*" | "(*") ~ junk ^^ { case x ~ y => Token(Token.Kind.UNPARSED, x + y) }
      val bad = junk1 ^^ (x => Token(Token.Kind.UNPARSED, x))

      val command_keyword =
        keyword ^^ (x => Token(if (is_command(x)) Token.Kind.COMMAND else Token.Kind.KEYWORD, x))

      // ||| takes the longest alternative, so a keyword that is a prefix of
      // a longer identifier-like token loses to the longer match
      space | (bad_delimiter |
        (((ident | (var_ | (type_ident | (type_var | (float | (nat_ | sym_ident)))))) |||
          command_keyword) | bad))
    }
   378 
   379     def token(is_command: String => Boolean): Parser[Token] =
   380       delimited_token | other_token(is_command)
   381 
   382     def token_context(is_command: String => Boolean, ctxt: Context): Parser[(Token, Context)] =
   383     {
   384       val string =
   385         quoted_context("\"", ctxt) ^^ { case (x, c) => (Token(Token.Kind.STRING, x), c) }
   386       val alt_string =
   387         quoted_context("`", ctxt) ^^ { case (x, c) => (Token(Token.Kind.ALT_STRING, x), c) }
   388       val verb = verbatim_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.VERBATIM, x), c) }
   389       val cmt = comment_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.COMMENT, x), c) }
   390       val other = other_token(is_command) ^^ { case x => (x, Finished) }
   391 
   392       string | (alt_string | (verb | (cmt | other)))
   393     }
   394   }
   395 
   396 
   397 
   398   /** read file without decoding -- enables efficient length operation **/
   399 
   400   private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int)
   401     extends CharSequence
   402   {
   403     def charAt(i: Int): Char =
   404       if (0 <= i && i < length) seq(start + i)
   405       else throw new IndexOutOfBoundsException
   406 
   407     def length: Int = end - start  // avoid potentially expensive seq.length
   408 
   409     def subSequence(i: Int, j: Int): CharSequence =
   410       if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j)
   411       else throw new IndexOutOfBoundsException
   412 
   413     override def toString: String =
   414     {
   415       val buf = new StringBuilder(length)
   416       for (offset <- start until end) buf.append(seq(offset))
   417       buf.toString
   418     }
   419   }
   420 
  // Char reader over raw bytes that can also release its underlying stream.
  abstract class Byte_Reader extends Reader[Char] { def close: Unit }
   422 
  // Reader over the raw bytes of `file`, each byte mapped to a Char without
  // decoding; the file length is known up-front, enabling O(1) length via
  // Restricted_Seq.  The caller must invoke close to release the stream.
  def byte_reader(file: File): Byte_Reader =
  {
    val stream = new BufferedInputStream(new FileInputStream(file))
    // lazily paged sequence replenished from the stream; the fill function
    // returns the number of chars produced, or -1 at end of input
    val seq = new PagedSeq(
      (buf: Array[Char], offset: Int, length: Int) =>
        {
          var i = 0
          var c = 0
          var eof = false
          while (!eof && i < length) {
            c = stream.read
            if (c == -1) eof = true
            else { buf(offset + i) = c.toChar; i += 1 }
          }
          if (i > 0) i else -1
        })
    // NOTE(review): file.length.toInt truncates for files >= 2 GiB -- confirm
    // callers never pass such files
    val restricted_seq = new Restricted_Seq(seq, 0, file.length.toInt)

    class Paged_Reader(override val offset: Int) extends Byte_Reader
    {
      override lazy val source: CharSequence = restricted_seq
      // '\032' (SUB, traditional EOF marker) when reading past the end
      def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\032'
      def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this
      def pos: InputPosition = new OffsetPosition(source, offset)
      def atEnd: Boolean = !seq.isDefinedAt(offset)
      override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n)
      def close { stream.close }
    }
    new Paged_Reader(0)
  }
   453 }