src/Pure/General/scan.scala
author wenzelm
Thu, 07 Jul 2011 14:10:50 +0200
changeset 44570 58bb7ca5c7a2
parent 44569 5130dfe1b7be
child 46123 6de58d947e57
permissions -rw-r--r--
explicit indication of type Symbol.Symbol;
     1 /*  Title:      Pure/General/scan.scala
     2     Author:     Makarius
     3 
     4 Efficient scanning of keywords and tokens.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.generic.Addable
    11 import scala.collection.IndexedSeq
    12 import scala.collection.immutable.PagedSeq
    13 import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
    14 import scala.util.parsing.combinator.RegexParsers
    15 
    16 import java.io.{File, InputStream, BufferedInputStream, FileInputStream}
    17 
    18 
    19 object Scan
    20 {
    21   /** context of partial scans **/
    22 
  // Scanner state carried over from a partial scan, so that scanning can be
  // resumed on the next chunk of input (see *_context parsers below).
  sealed abstract class Context
  case object Finished extends Context                // no delimited token left open
  case class Quoted(quote: String) extends Context    // inside a quoted string, awaiting this closing quote
  case object Verbatim extends Context                // inside a {* ... *} verbatim block
  case class Comment(depth: Int) extends Context      // inside nested (* ... *) comments at given depth
    28 
    29 
    30 
    31   /** Lexicon -- position tree **/
    32 
  object Lexicon
  {
    // Prefix-tree node: maps the next character to a pair of
    // (complete keyword ending at that child, or "" if none; child subtree).
    protected case class Tree(val branches: Map[Char, (String, Tree)])
    private val empty_tree = Tree(Map())

    val empty: Lexicon = new Lexicon
    def apply(elems: String*): Lexicon = empty ++ elems
  }
    41 
    42   class Lexicon extends Addable[String, Lexicon] with RegexParsers
    43   {
    44     /* representation */
    45 
    46     import Lexicon.Tree
    47     protected val main_tree: Tree = Lexicon.empty_tree
    48 
    49 
    50     /* auxiliary operations */
    51 
    // Collect all keywords stored in the given subtree, prepending them to
    // result (order is unspecified; callers sort).  Uses an explicit foldLeft
    // instead of the deprecated symbolic fold `/:` (removed in Scala 2.13).
    private def content(tree: Tree, result: List[String]): List[String] =
      tree.branches.toList.foldLeft(result) {
        case (res, (_, (s, tr))) =>
          if (s.isEmpty) content(tr, res) else content(tr, s :: res)
      }
    56 
    // Follow str through the prefix tree.  Returns None if str is not even a
    // prefix of any keyword; otherwise Some(tip, subtree) where tip indicates
    // that str itself is a complete keyword.  Fixes deprecated auto-tupling
    // `Some(tip, tree)` by writing the tuple explicitly.
    private def lookup(str: CharSequence): Option[(Boolean, Tree)] =
    {
      val len = str.length
      @annotation.tailrec
      def look(tree: Tree, tip: Boolean, i: Int): Option[(Boolean, Tree)] =
      {
        if (i < len) {
          tree.branches.get(str.charAt(i)) match {
            case Some((s, tr)) => look(tr, !s.isEmpty, i + 1)
            case None => None
          }
        }
        else Some((tip, tree))
      }
      look(main_tree, false, 0)
    }
    71 
    // All keywords that have str as a prefix; str itself is included when it
    // is a complete keyword.
    def completions(str: CharSequence): List[String] =
      lookup(str) match {
        case None => Nil
        case Some((is_word, tree)) =>
          content(tree, if (is_word) List(str.toString) else Nil)
      }
    78 
    79 
    80     /* pseudo Set methods */
    81 
    // All keywords in ascending lexicographic order.
    def iterator: Iterator[String] =
      content(main_tree, Nil).sorted.iterator

    override def toString: String = iterator.mkString("Lexicon(", ", ", ")")

    def empty: Lexicon = Lexicon.empty

    def isEmpty: Boolean = main_tree.branches.isEmpty

    // Membership: str must end exactly at a node marked as a complete keyword.
    def contains(elem: String): Boolean =
      lookup(elem).exists({ case (tip, _) => tip })
    94 
    95 
    /* Addable methods */

    def repr = this

    // Add elem to the lexicon.  The prefix tree is extended purely
    // functionally, sharing all unchanged branches with the old tree; the
    // node at the last character records the complete keyword.
    def + (elem: String): Lexicon =
      if (contains(elem)) this
      else {
        val len = elem.length
        def insert(tree: Tree, i: Int): Tree =
          if (i >= len) tree
          else {
            val c = elem.charAt(i)
            // existing entry for c, or a fresh empty branch
            val (word, subtree) =
              tree.branches.get(c) match {
                case Some(entry) => entry
                case None => ("", Lexicon.empty_tree)
              }
            val word1 = if (i + 1 == len) elem else word
            Tree(tree.branches + (c -> (word1, insert(subtree, i + 1))))
          }
        val old = this
        new Lexicon { override val main_tree = insert(old.main_tree, 0) }
      }
   121 
   122 
   123 
   124     /** RegexParsers methods **/
   125 
   126     override val whiteSpace = "".r
   127 
   128 
   129     /* optional termination */
   130 
   131     def opt_term[T](p: => Parser[T]): Parser[Option[T]] =
   132       p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)
   133 
   134 
   135     /* keywords from lexicon */
   136 
   137     def keyword: Parser[String] = new Parser[String]
   138     {
   139       def apply(in: Input) =
   140       {
   141         val source = in.source
   142         val offset = in.offset
   143         val len = source.length - offset
   144 
   145         def scan(tree: Tree, result: String, i: Int): String =
   146         {
   147           if (i < len) {
   148             tree.branches.get(source.charAt(offset + i)) match {
   149               case Some((s, tr)) => scan(tr, if (s.isEmpty) result else s, i + 1)
   150               case None => result
   151             }
   152           }
   153           else result
   154         }
   155         val result = scan(main_tree, "", 0)
   156         if (result.isEmpty) Failure("keyword expected", in)
   157         else Success(result, in.drop(result.length))
   158       }
   159     }.named("keyword")
   160 
   161 
   162     /* symbol range */
   163 
    // Parser consuming between min_count and max_count consecutive symbols
    // that satisfy pred, returning the matched text.  Symbol boundaries come
    // from Symbol.Matcher -- NOTE(review): presumably multi-character
    // Isabelle symbols; exact matcher semantics are defined elsewhere.
    def symbol_range(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
      new Parser[String]
      {
        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length
          val matcher = new Symbol.Matcher(in.source)

          var i = start
          var count = 0
          var finished = false
          while (!finished) {
            if (i < end && count < max_count) {
              val n = matcher(i, end)  // length (in chars) of the symbol at position i
              val sym = in.source.subSequence(i, i + n).toString
              if (pred(sym)) { i += n; count += 1 }
              else finished = true
            }
            else finished = true
          }
          // too few matching symbols: fail without consuming input
          if (count < min_count) Failure("bad input", in)
          else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
        }
      }.named("symbol_range")
   189 
   190     def one(pred: Symbol.Symbol => Boolean): Parser[String] =
   191       symbol_range(pred, 1, 1)
   192 
   193     def many(pred: Symbol.Symbol => Boolean): Parser[String] =
   194       symbol_range(pred, 0, Integer.MAX_VALUE)
   195 
   196     def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
   197       symbol_range(pred, 1, Integer.MAX_VALUE)
   198 
   199 
   200     /* quoted strings */
   201 
    // Body of a quoted string: runs of symbols other than the quote and
    // backslash, an escaped quote, an escaped backslash, or a 3-digit decimal
    // character escape whose value is at most 255.
    private def quoted_body(quote: Symbol.Symbol): Parser[String] =
    {
      rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
        (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
    }
   207 
   208     private def quoted(quote: Symbol.Symbol): Parser[String] =
   209     {
   210       quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
   211     }.named("quoted")
   212 
   213     def quoted_content(quote: Symbol.Symbol, source: String): String =
   214     {
   215       require(parseAll(quoted(quote), source).successful)
   216       val body = source.substring(1, source.length - 1)
   217       if (body.exists(_ == '\\')) {
   218         val content =
   219           rep(many1(sym => sym != quote && sym != "\\") |
   220               "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
   221         parseAll(content ^^ (_.mkString), body).get
   222       }
   223       else body
   224     }
   225 
    // Scan a quoted string across chunk boundaries: in Finished context an
    // opening quote starts a string that may or may not terminate within this
    // input; in a matching Quoted context the pending body continues and may
    // be finished by a closing quote.
    def quoted_context(quote: Symbol.Symbol, ctxt: Context): Parser[(String, Context)] =
    {
      ctxt match {
        case Finished =>
          quote ~ quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)   // closing quote found
              case x ~ y ~ None => (x + y, Quoted(quote)) }   // input ended inside the string
        case Quoted(q) if q == quote =>
          quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, ctxt) }
        case _ => failure("")   // wrong context: let other token parsers try
      }
    }.named("quoted_context")
   240 
   241 
   242     /* verbatim text */
   243 
   244     private def verbatim_body: Parser[String] =
   245       rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)
   246 
   247     private def verbatim: Parser[String] =
   248     {
   249       "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
   250     }.named("verbatim")
   251 
   252     def verbatim_content(source: String): String =
   253     {
   254       require(parseAll(verbatim, source).successful)
   255       source.substring(2, source.length - 2)
   256     }
   257 
    // Scan verbatim text across chunk boundaries, analogous to
    // quoted_context: in Finished context "{*" opens a block that may or may
    // not terminate within this input; in Verbatim context the pending body
    // continues and may be finished by "*}".
    def verbatim_context(ctxt: Context): Parser[(String, Context)] =
    {
      ctxt match {
        case Finished =>
          "{*" ~ verbatim_body ~ opt_term("*}") ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)
              case x ~ y ~ None => (x + y, Verbatim) }
        case Verbatim =>
          verbatim_body ~ opt_term("*}") ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, Verbatim) }
        case _ => failure("")
      }
    }.named("verbatim_context")
   272 
   273 
   274     /* nested comments */
   275 
    // Parser for (possibly unfinished) nested comments "(* ... *)".
    // Starting at nesting depth `depth`, it consumes comment text, openings
    // and closings as long as possible and returns the consumed text together
    // with the resulting depth (0 means the comment closed properly).
    private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
    {
      require(depth >= 0)

      // chunk of plain comment text: "*" and "(" only when not starting a delimiter
      val comment_text =
        rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)

      def apply(in: Input) =
      {
        var rest = in
        // try p at the current position; on success advance `rest`
        def try_parse[A](p: Parser[A]): Boolean =
        {
          parse(p ^^^ (), rest) match {
            case Success(_, next) => { rest = next; true }
            case _ => false
          }
        }
        var d = depth
        var finished = false
        while (!finished) {
          if (try_parse("(*")) d += 1
          else if (d > 0 && try_parse("*)")) d -= 1
          // at depth 0 only a new opening may continue the scan
          else if (d == 0 || !try_parse(comment_text)) finished = true
        }
        // must have consumed at least one char to succeed
        if (in.offset < rest.offset)
          Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)
        else Failure("comment expected", in)
      }
    }.named("comment_depth")
   305 
   306     def comment: Parser[String] =
   307       comment_depth(0) ^? { case (x, d) if d == 0 => x }
   308 
   309     def comment_context(ctxt: Context): Parser[(String, Context)] =
   310     {
   311       val depth =
   312         ctxt match {
   313           case Finished => 0
   314           case Comment(d) => d
   315           case _ => -1
   316         }
   317       if (depth >= 0)
   318         comment_depth(depth) ^^
   319           { case (x, 0) => (x, Finished)
   320             case (x, d) => (x, Comment(d)) }
   321       else failure("")
   322     }
   323 
   324     def comment_content(source: String): String =
   325     {
   326       require(parseAll(comment, source).successful)
   327       source.substring(2, source.length - 2)
   328     }
   329 
   330 
   331     /* outer syntax tokens */
   332 
   333     private def delimited_token: Parser[Token] =
   334     {
   335       val string = quoted("\"") ^^ (x => Token(Token.Kind.STRING, x))
   336       val alt_string = quoted("`") ^^ (x => Token(Token.Kind.ALT_STRING, x))
   337       val verb = verbatim ^^ (x => Token(Token.Kind.VERBATIM, x))
   338       val cmt = comment ^^ (x => Token(Token.Kind.COMMENT, x))
   339 
   340       string | (alt_string | (verb | cmt))
   341     }
   342 
    // Scan a non-delimited outer-syntax token: identifiers, variables,
    // numerals, symbolic identifiers, keywords/commands, white space, or
    // otherwise unparsed junk.
    private def other_token(is_command: String => Boolean)
      : Parser[Token] =
    {
      // basic syntactic entities
      val id = one(Symbol.is_letter) ~ many(Symbol.is_letdig) ^^ { case x ~ y => x + y }
      val nat = many1(Symbol.is_digit)
      val natdot = nat ~ "." ~ nat ^^ { case x ~ y ~ z => x + y + z }
      val id_nat = id ~ opt("." ~ nat) ^^ { case x ~ Some(y ~ z) => x + y + z case x ~ None => x }

      // plain vs. dot-separated long identifiers
      val ident = id ~ rep("." ~> id) ^^
        { case x ~ Nil => Token(Token.Kind.IDENT, x)
          case x ~ ys => Token(Token.Kind.LONG_IDENT, (x :: ys).mkString(".")) }

      val var_ = "?" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.VAR, x + y) }
      val type_ident = "'" ~ id ^^ { case x ~ y => Token(Token.Kind.TYPE_IDENT, x + y) }
      val type_var = "?'" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.TYPE_VAR, x + y) }
      val nat_ = nat ^^ (x => Token(Token.Kind.NAT, x))
      val float =
        ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))

      val sym_ident =
        (many1(Symbol.is_symbolic_char) | one(sym => Symbol.is_symbolic(sym))) ^^
        (x => Token(Token.Kind.SYM_IDENT, x))

      val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))

      // FIXME check
      // everything up to the next blank symbol counts as junk
      val junk = many(sym => !(Symbol.is_blank(sym)))
      val junk1 = many1(sym => !(Symbol.is_blank(sym)))

      // a stray opening delimiter swallows the rest of the "word" as UNPARSED
      val bad_delimiter =
        ("\"" | "`" | "{*" | "(*") ~ junk ^^ { case x ~ y => Token(Token.Kind.UNPARSED, x + y) }
      val bad = junk1 ^^ (x => Token(Token.Kind.UNPARSED, x))

      val command_keyword =
        keyword ^^ (x => Token(if (is_command(x)) Token.Kind.COMMAND else Token.Kind.KEYWORD, x))

      // ||| picks the longest successful alternative; plain | the first one --
      // so a keyword that is also a prefix of an identifier is resolved by length
      space | (bad_delimiter |
        (((ident | (var_ | (type_ident | (type_var | (float | (nat_ | sym_ident)))))) |||
          command_keyword) | bad))
    }
   383 
   384     def token(is_command: String => Boolean): Parser[Token] =
   385       delimited_token | other_token(is_command)
   386 
   387     def token_context(is_command: String => Boolean, ctxt: Context): Parser[(Token, Context)] =
   388     {
   389       val string =
   390         quoted_context("\"", ctxt) ^^ { case (x, c) => (Token(Token.Kind.STRING, x), c) }
   391       val alt_string =
   392         quoted_context("`", ctxt) ^^ { case (x, c) => (Token(Token.Kind.ALT_STRING, x), c) }
   393       val verb = verbatim_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.VERBATIM, x), c) }
   394       val cmt = comment_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.COMMENT, x), c) }
   395       val other = other_token(is_command) ^^ { case x => (x, Finished) }
   396 
   397       string | (alt_string | (verb | (cmt | other)))
   398     }
   399   }
   400 
   401 
   402 
   403   /** read file without decoding -- enables efficient length operation **/
   404 
   405   private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int)
   406     extends CharSequence
   407   {
   408     def charAt(i: Int): Char =
   409       if (0 <= i && i < length) seq(start + i)
   410       else throw new IndexOutOfBoundsException
   411 
   412     def length: Int = end - start  // avoid potentially expensive seq.length
   413 
   414     def subSequence(i: Int, j: Int): CharSequence =
   415       if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j)
   416       else throw new IndexOutOfBoundsException
   417 
   418     override def toString: String =
   419     {
   420       val buf = new StringBuilder(length)
   421       for (offset <- start until end) buf.append(seq(offset))
   422       buf.toString
   423     }
   424   }
   425 
  // Reader over the raw bytes of a file, mapped 1:1 to chars (no text decoding).
  abstract class Byte_Reader extends Reader[Char] { def close: Unit }

  def byte_reader(file: File): Byte_Reader =
  {
    val stream = new BufferedInputStream(new FileInputStream(file))
    // lazily paged char sequence, filled from the stream on demand
    val seq = new PagedSeq(
      (buf: Array[Char], offset: Int, length: Int) =>
        {
          var i = 0
          var c = 0
          var eof = false
          while (!eof && i < length) {
            c = stream.read
            if (c == -1) eof = true
            else { buf(offset + i) = c.toChar; i += 1 }
          }
          // number of chars read, or -1 to signal end of input to PagedSeq
          if (i > 0) i else -1
        })
    // restrict to the file length known up-front, so that source.length does
    // not force reading the whole PagedSeq
    // NOTE(review): file.length.toInt silently truncates for files >= 2 GB
    val restricted_seq = new Restricted_Seq(seq, 0, file.length.toInt)

    class Paged_Reader(override val offset: Int) extends Byte_Reader
    {
      override lazy val source: CharSequence = restricted_seq
      // '\032' (SUB) serves as end-of-input sentinel beyond the last char
      def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\032'
      def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this
      def pos: InputPosition = new OffsetPosition(source, offset)
      def atEnd: Boolean = !seq.isDefinedAt(offset)
      override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n)
      // closing any derived reader closes the shared underlying stream
      def close { stream.close }
    }
    new Paged_Reader(0)
  }
   458 }