001 /* StreamTokenizer.java -- parses streams of characters into tokens
002 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation
003
004 This file is part of GNU Classpath.
005
006 GNU Classpath is free software; you can redistribute it and/or modify
007 it under the terms of the GNU General Public License as published by
008 the Free Software Foundation; either version 2, or (at your option)
009 any later version.
010
011 GNU Classpath is distributed in the hope that it will be useful, but
012 WITHOUT ANY WARRANTY; without even the implied warranty of
013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 General Public License for more details.
015
016 You should have received a copy of the GNU General Public License
017 along with GNU Classpath; see the file COPYING. If not, write to the
018 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019 02110-1301 USA.
020
021 Linking this library statically or dynamically with other modules is
022 making a combined work based on this library. Thus, the terms and
023 conditions of the GNU General Public License cover the whole
024 combination.
025
026 As a special exception, the copyright holders of this library give you
027 permission to link this library with independent modules to produce an
028 executable, regardless of the license terms of these independent
029 modules, and to copy and distribute the resulting executable under
030 terms of your choice, provided that you also meet, for each linked
031 independent module, the terms and conditions of the license of that
032 module. An independent module is a module which is not derived from
033 or based on this library. If you modify this library, you may extend
034 this exception to your version of the library, but you are not
035 obligated to do so. If you do not wish to do so, delete this
036 exception statement from your version. */
037
038 package java.io;
039
040 /**
041 * This class parses streams of characters into tokens. There are a
042 * million-zillion flags that can be set to control the parsing, as
043 * described under the various method headings.
044 *
045 * @author Warren Levy (warrenl@cygnus.com)
046 * @date October 25, 1998.
047 */
048 /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
049 * "The Java Language Specification", ISBN 0-201-63451-1
050 * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
051 * Status: Believed complete and correct.
052 */
053
public class StreamTokenizer
{
  /** A constant indicating that the end of the stream has been read. */
  public static final int TT_EOF = -1;

  /** A constant indicating that the end of the line has been read. */
  public static final int TT_EOL = '\n';

  /** A constant indicating that a number token has been read. */
  public static final int TT_NUMBER = -2;

  /** A constant indicating that a word token has been read. */
  public static final int TT_WORD = -3;

  /** A constant indicating that no tokens have been read yet. */
  private static final int TT_NONE = -4;

  /**
   * Contains the type of the token read resulting from a call to nextToken.
   * The rules are as follows:
   * <ul>
   * <li>For a token consisting of a single ordinary character, this is the
   *     value of that character.</li>
   * <li>For a quoted string, this is the value of the quote character</li>
   * <li>For a word, this is TT_WORD</li>
   * <li>For a number, this is TT_NUMBER</li>
   * <li>For the end of the line, this is TT_EOL</li>
   * <li>For the end of the stream, this is TT_EOF</li>
   * </ul>
   */
  public int ttype = TT_NONE;

  /** The String associated with word and string tokens. */
  public String sval;

  /** The numeric value associated with number tokens. */
  public double nval;

  /* Indicates whether end-of-line is recognized as a token. */
  private boolean eolSignificant = false;

  /* Indicates whether word tokens are automatically made lower case. */
  private boolean lowerCase = false;

  /* Indicates whether C++ style comments are recognized and skipped. */
  private boolean slashSlash = false;

  /* Indicates whether C style comments are recognized and skipped. */
  private boolean slashStar = false;

  /* Attribute tables of each byte from 0x00 to 0xFF. */
  private final boolean[] whitespace = new boolean[256];
  private final boolean[] alphabetic = new boolean[256];
  private final boolean[] numeric = new boolean[256];
  private final boolean[] quote = new boolean[256];
  private final boolean[] comment = new boolean[256];

  /* The Reader associated with this class. */
  private final PushbackReader in;

  /* Indicates if a token has been pushed back. */
  private boolean pushedBack = false;

  /* Contains the current line number of the reader. */
  private int lineNumber = 1;

  /**
   * This method reads bytes from an <code>InputStream</code> and tokenizes
   * them.  The stream is wrapped in an <code>InputStreamReader</code>; for
   * details on how this method operates by default, see
   * <code>StreamTokenizer(Reader)</code>.
   *
   * @param is The <code>InputStream</code> to read from
   *
   * @deprecated Since JDK 1.1.
   */
  public StreamTokenizer(InputStream is)
  {
    this(new InputStreamReader(is));
  }

  /**
   * This method initializes a new <code>StreamTokenizer</code> to read
   * characters from a <code>Reader</code> and parse them.  Only characters
   * in the range 0x0000 to 0x00FF have configurable attributes; any
   * character above that range is treated as alphabetic.
   * <p>
   * This constructor sets up the parsing table to parse the stream in the
   * following manner:
   * <ul>
   * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
   *     are initialized as alphabetic</li>
   * <li>The values 0x00 through 0x20 are initialized as whitespace</li>
   * <li>The values '\'' and '"' are initialized as quote characters</li>
   * <li>'/' is a comment character</li>
   * <li>Numbers will be parsed</li>
   * <li>EOL is not treated as significant</li>
   * <li>C and C++ (//) comments are not recognized</li>
   * </ul>
   *
   * @param r The <code>Reader</code> to read chars from
   */
  public StreamTokenizer(Reader r)
  {
    in = new PushbackReader(r);

    whitespaceChars(0x00, 0x20);
    wordChars('A', 'Z');
    wordChars('a', 'z');
    wordChars(0xA0, 0xFF);
    commentChar('/');
    quoteChar('\'');
    quoteChar('"');
    parseNumbers();
  }

  /**
   * This method sets the comment attribute on the specified
   * character.  Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the comment attribute for, passed as an int
   */
  public void commentChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        resetChar(ch);
        comment[ch] = true;
      }
  }

  /**
   * This method sets a flag that indicates whether or not the end of line
   * sequence terminates and is a token.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> if EOL is significant, <code>false</code>
   *             otherwise
   */
  public void eolIsSignificant(boolean flag)
  {
    eolSignificant = flag;
  }

  /**
   * This method returns the current line number.  Note that if the
   * <code>pushBack()</code> method is called, it has no effect on the
   * line number returned by this method.
   *
   * @return The current line number
   */
  public int lineno()
  {
    return lineNumber;
  }

  /**
   * This method sets a flag that indicates whether or not alphabetic
   * tokens that are returned should be converted to lower case.
   *
   * @param flag <code>true</code> to convert to lower case,
   *             <code>false</code> otherwise
   */
  public void lowerCaseMode(boolean flag)
  {
    lowerCase = flag;
  }

  /* Table lookup; anything outside 0..255 (including EOF) is not whitespace. */
  private boolean isWhitespace(int ch)
  {
    return (ch >= 0 && ch <= 255 && whitespace[ch]);
  }

  /* Table lookup; every character above 255 counts as alphabetic. */
  private boolean isAlphabetic(int ch)
  {
    return ((ch > 255) || (ch >= 0 && alphabetic[ch]));
  }

  /* Table lookup; anything outside 0..255 (including EOF) is not numeric. */
  private boolean isNumeric(int ch)
  {
    return (ch >= 0 && ch <= 255 && numeric[ch]);
  }

  /* Table lookup; anything outside 0..255 (including EOF) is not a quote. */
  private boolean isQuote(int ch)
  {
    return (ch >= 0 && ch <= 255 && quote[ch]);
  }

  /* Table lookup; anything outside 0..255 (including EOF) is not a comment. */
  private boolean isComment(int ch)
  {
    return (ch >= 0 && ch <= 255 && comment[ch]);
  }

  /**
   * This method reads the next token from the stream.  It sets the
   * <code>ttype</code> variable to the appropriate token type and
   * returns it.  It also can set <code>sval</code> or <code>nval</code>
   * as described below.  The parsing strategy is as follows:
   * <ul>
   * <li>Skip any whitespace characters.</li>
   * <li>If a numeric character is encountered, attempt to parse a numeric
   * value.  Leading '-' characters indicate a numeric only if followed by
   * another non-'-' numeric.  The value of the numeric token is terminated
   * by either the first non-numeric encountered, or the second occurrence of
   * '-' or '.'.  The token type returned is TT_NUMBER and <code>nval</code>
   * is set to the value parsed.</li>
   * <li>If an alphabetic character is parsed, all subsequent characters
   * are read until the first non-alphabetic or non-numeric character is
   * encountered.  The token type returned is TT_WORD and the value parsed
   * is stored in <code>sval</code>.  If lower case mode is set, the token
   * stored in <code>sval</code> is converted to lower case.  The end of line
   * sequence terminates a word only if EOL significance has been turned on.
   * The start of a comment also terminates a word.  Any character with a
   * non-alphabetic and non-numeric attribute (such as white space, a quote,
   * or a comment) is treated as non-alphabetic and terminates the word.</li>
   * <li>If a comment character is parsed, then all remaining characters on
   * the current line are skipped and another token is parsed.  Any EOL or
   * EOF's encountered are not discarded, but rather terminate the comment.</li>
   * <li>If a quote character is parsed, then all characters up to the
   * second occurrence of the same quote character are parsed into a
   * <code>String</code>.  This <code>String</code> is stored as
   * <code>sval</code>, but is not converted to lower case, even if lower case
   * mode is enabled.  The token type returned is the value of the quote
   * character encountered.  Any escape sequences
   * (\a (bell), \b (backspace), \t (HTAB), \n (linefeed), \v (vertical tab),
   * \f (form feed), \r (carriage return), \" (double quote),
   * \' (single quote), \\ (backslash), \XXX (octal escape)) are converted
   * to the appropriate char values.  For any other escape sequence the
   * backslash is dropped and the following character is kept as is.
   * Unicode characters like ('\ u0000') are not recognized. </li>
   * <li>If the C++ comment sequence "//" is encountered, and the parser
   * is configured to handle that sequence, then the remainder of the line
   * is skipped and another token is read exactly as if a character with
   * the comment attribute was encountered.</li>
   * <li>If the C comment sequence "/*" is encountered, and the parser
   * is configured to handle that sequence, then all characters up to and
   * including the comment terminator sequence are discarded and another
   * token is parsed.</li>
   * <li>If all cases above are not met, then the character is an ordinary
   * character that is parsed as a token by itself.  The char encountered
   * is returned as the token type.</li>
   * </ul>
   *
   * @return The token type
   * @exception IOException If an I/O error occurs
   */
  public int nextToken() throws IOException
  {
    if (pushedBack)
      {
        pushedBack = false;
        if (ttype != TT_NONE)
          return ttype;
      }

    sval = null;

    // Every pass either returns a token or skips exactly one comment and
    // starts over.  Looping (rather than recursing) keeps the stack flat
    // no matter how many comments follow each other.
    while (true)
      {
        int ch;

        // Skip whitespace.  Deal with EOL along the way.
        while (isWhitespace(ch = in.read()))
          {
            if (ch == '\n' || ch == '\r')
              {
                lineNumber++;
                if (ch == '\r')
                  skipLineFeed();
                if (eolSignificant)
                  return (ttype = TT_EOL);
              }
          }

        // A '/' may open a C or C++ style comment; peek at the next char.
        if (ch == '/')
          {
            int next = in.read();
            if (next == '/' && slashSlash)
              {
                skipToEndOfLine();
                continue;
              }
            if (next == '*' && slashStar)
              {
                skipBlockComment();
                continue;
              }
            // Not a comment opener: put the look-ahead back and treat the
            // '/' according to its own attributes below.
            if (next != TT_EOF)
              in.unread(next);
          }

        if (ch == TT_EOF)
          return (ttype = TT_EOF);
        if (isNumeric(ch))
          return readNumber(ch);
        if (isAlphabetic(ch))
          return readWord(ch);
        if (isComment(ch))
          {
            skipToEndOfLine();
            continue;
          }
        if (isQuote(ch))
          return readQuoted(ch);

        // Ordinary character: it is its own token.
        return (ttype = ch);
      }
  }

  /* Called right after a '\r' was read: swallows a directly following '\n'
     so that the "\r\n" pair counts as one line terminator.  */
  private void skipLineFeed() throws IOException
  {
    int next = in.read();
    if (next != '\n' && next != TT_EOF)
      in.unread(next);
  }

  /* Discards characters up to, but not including, the next '\n', '\r' or
     end of stream.  The terminator is pushed back for the caller.  */
  private void skipToEndOfLine() throws IOException
  {
    int ch;
    do
      ch = in.read();
    while (ch != '\n' && ch != '\r' && ch != TT_EOF);

    if (ch != TT_EOF)
      in.unread(ch);
  }

  /* Discards the rest of a C style comment whose opening sequence has
     already been consumed, counting the lines it spans.  Stops after the
     closing sequence or at end of stream.  */
  private void skipBlockComment() throws IOException
  {
    while (true)
      {
        int ch = in.read();
        if (ch == TT_EOF)
          return;

        if (ch == '*')
          {
            int next = in.read();
            if (next == '/')
              return;
            // Might itself be a '*' that starts the terminator: re-read it.
            if (next != TT_EOF)
              in.unread(next);
          }
        else if (ch == '\n' || ch == '\r')
          {
            lineNumber++;
            if (ch == '\r')
              skipLineFeed();
          }
      }
  }

  /* Parses a number whose first (numeric) character is ch.  Sets ttype and
     nval and returns ttype; a '-' that does not introduce a number is
     returned as the ordinary token '-'.  */
  private int readNumber(int ch) throws IOException
  {
    boolean isNegative = false;
    if (ch == '-')
      {
        // Read ahead to see if this is an ordinary '-' rather than numeric.
        ch = in.read();
        if (!isNumeric(ch) || ch == '-')
          {
            if (ch != TT_EOF)
              in.unread(ch);
            return (ttype = '-');
          }
        isNegative = true;
      }

    StringBuffer tokbuf = new StringBuffer();
    tokbuf.append((char) ch);

    // A leading '.' already uses up the single decimal point a number may
    // contain; without counting it, ".5.3" would be collected as one
    // unparsable token.
    int decCount = (ch == '.') ? 1 : 0;
    while (isNumeric(ch = in.read()) && ch != '-')
      {
        if (ch == '.' && decCount++ > 0)
          break;
        tokbuf.append((char) ch);
      }

    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_NUMBER;
    try
      {
        nval = Double.valueOf(tokbuf.toString()).doubleValue();
      }
    catch (NumberFormatException ignored)
      {
        // Only a lone "." gets here; it is reported as the number zero.
        nval = 0.0;
      }
    if (isNegative)
      nval = -nval;
    return ttype;
  }

  /* Parses a word whose first (alphabetic) character is ch.  Sets ttype and
     sval, honouring lower case mode, and returns ttype.  */
  private int readWord(int ch) throws IOException
  {
    StringBuffer tokbuf = new StringBuffer();
    do
      {
        tokbuf.append((char) ch);
        ch = in.read();
      }
    while (isAlphabetic(ch) || isNumeric(ch));

    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_WORD;
    sval = tokbuf.toString();
    if (lowerCase)
      sval = sval.toLowerCase();
    return ttype;
  }

  /* Parses a quoted string opened by quoteCh.  Sets ttype to the quote
     character and sval to the unescaped contents, and returns ttype.  The
     string ends at the matching quote, at a line terminator or at end of
     stream.  */
  private int readQuoted(int quoteCh) throws IOException
  {
    ttype = quoteCh;
    StringBuffer tokbuf = new StringBuffer();

    int ch;
    while ((ch = in.read()) != quoteCh && ch != '\n' && ch != '\r'
           && ch != TT_EOF)
      {
        if (ch == '\\')
          {
            ch = readEscape();
            // A backslash right before end of stream contributes nothing.
            if (ch == TT_EOF)
              break;
          }
        tokbuf.append((char) ch);
      }

    // The matching quote is thrown away; a line terminator that cut the
    // string short is left in the stream for the next call.
    if (ch != quoteCh && ch != TT_EOF)
      in.unread(ch);

    sval = tokbuf.toString();
    return ttype;
  }

  /* Reads the remainder of an escape sequence (the backslash has already
     been consumed) and returns the character value it denotes, or TT_EOF
     if the stream ended right after the backslash.  */
  private int readEscape() throws IOException
  {
    int ch = in.read();
    switch (ch)
      {
      case 'a':
        return 0x7;
      case 'b':
        return '\b';
      case 'f':
        return 0xC;
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'v':
        return 0xB;
      default:
        break;
      }

    // Escaped quotes, backslashes, line terminators and unknown escapes
    // all stand for the character itself.  It has been consumed, so it
    // must not be pushed back or it would be seen twice.
    if (ch < '0' || ch > '7')
      return ch;

    // Octal escape: one to three digits; three only when the value fits
    // in eight bits (first digit 0-3).
    int first = ch;
    int value = ch - '0';
    int next = in.read();
    if (next >= '0' && next <= '7')
      {
        value = value * 8 + (next - '0');
        next = in.read();
        if (next >= '0' && next <= '7' && first <= '3')
          {
            value = value * 8 + (next - '0');
            next = in.read();
          }
      }

    // next is the first character that is not part of the escape.
    if (next != TT_EOF)
      in.unread(next);
    return value;
  }

  /* Clears every attribute of ch.  Callers guarantee 0 <= ch <= 255.  */
  private void resetChar(int ch)
  {
    whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] =
      false;
  }

  /**
   * This method makes the specified character an ordinary character.  This
   * means that none of the attributes (whitespace, alphabetic, numeric,
   * quote, or comment) will be set on this character.  This character will
   * parse as its own token.  Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to make ordinary, passed as an int
   */
  public void ordinaryChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      resetChar(ch);
  }

  /**
   * This method makes all the characters in the specified range, range
   * terminators included, ordinary.  This means that none of the attributes
   * (whitespace, alphabetic, numeric, quote, or comment) will be set on
   * any of the characters in the range.  This makes each character in this
   * range parse as its own token.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to make ordinary
   * @param hi The high end of the range of values to make ordinary
   */
  public void ordinaryChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      resetChar(i);
  }

  /**
   * This method sets the numeric attribute on the characters '0' - '9' and
   * the characters '.' and '-'.
   * When this method is used, the result of giving other attributes
   * (whitespace, quote, or comment) to the numeric characters may
   * vary depending on the implementation.  For example, if
   * parseNumbers() and then whitespaceChars('1', '1') are called,
   * this implementation reads "121" as 2, while some other implementation
   * will read it as 21.
   */
  public void parseNumbers()
  {
    for (int i = '0'; i <= '9'; i++)
      numeric[i] = true;

    numeric['.'] = true;
    numeric['-'] = true;
  }

  /**
   * Puts the current token back into the StreamTokenizer so
   * <code>nextToken</code> will return the same value on the next call.
   * May cause the lineno method to return an incorrect value
   * if lineno is called before the next call to nextToken.
   */
  public void pushBack()
  {
    pushedBack = true;
  }

  /**
   * This method sets the quote attribute on the specified character.
   * Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the quote attribute for, passed as an int.
   */
  public void quoteChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        resetChar(ch);
        quote[ch] = true;
      }
  }

  /**
   * This method removes all attributes (whitespace, alphabetic, numeric,
   * quote, and comment) from all characters.  It is equivalent to calling
   * <code>ordinaryChars(0x00, 0xFF)</code>.
   *
   * @see #ordinaryChars(int, int)
   */
  public void resetSyntax()
  {
    ordinaryChars(0x00, 0xFF);
  }

  /**
   * This method sets a flag that indicates whether or not "C++" language style
   * comments ("//" comments through EOL ) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C++" style
   *             comments, <code>false</code> otherwise
   */
  public void slashSlashComments(boolean flag)
  {
    slashSlash = flag;
  }

  /**
   * This method sets a flag that indicates whether or not "C" language style
   * comments (with nesting not allowed) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C" style comments,
   *             <code>false</code> otherwise
   */
  public void slashStarComments(boolean flag)
  {
    slashStar = flag;
  }

  /**
   * This method returns the current token value as a <code>String</code> in
   * the form "Token[x], line n", where 'n' is the current line number and
   * 'x' is determined as follows.
   * <p>
   * <ul>
   * <li>If no token has been read, then 'x' is "NOTHING"</li>
   * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li>
   * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li>
   * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li>
   * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
   * 'strnval' is <code>String.valueOf(nval)</code>.</li>
   * <li>If <code>ttype</code> is a quote character, then 'x' is
   * <code>sval</code></li>
   * <li>For all other cases, 'x' is <code>ttype</code> as a character
   * enclosed in single quotes</li>
   * </ul>
   *
   * @return The description of the current token
   */
  public String toString()
  {
    String tempstr;
    if (ttype == TT_EOF)
      tempstr = "EOF";
    else if (ttype == TT_EOL)
      tempstr = "EOL";
    else if (ttype == TT_WORD)
      tempstr = sval;
    else if (ttype == TT_NUMBER)
      tempstr = "n=" + nval;
    else if (ttype == TT_NONE)
      tempstr = "NOTHING";
    else if (isQuote(ttype))
      tempstr = sval;   // quoted string: show its contents
    else // must be an ordinary char.
      tempstr = "\'" + (char) ttype + "\'";

    return "Token[" + tempstr + "], line " + lineno();
  }

  /**
   * This method sets the whitespace attribute for all characters in the
   * specified range, range terminators included.  Other attributes of
   * those characters are cleared.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to set the whitespace
   *            attribute for
   * @param hi The high end of the range of values to set the whitespace
   *           attribute for
   */
  public void whitespaceChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      {
        resetChar(i);
        whitespace[i] = true;
      }
  }

  /**
   * This method sets the alphabetic attribute for all characters in the
   * specified range, range terminators included.  Other attributes of
   * those characters are left unchanged.  The range is clipped to 0
   * through 255.
   *
   * @param low The low end of the range of values to set the alphabetic
   *            attribute for
   * @param hi The high end of the range of values to set the alphabetic
   *           attribute for
   */
  public void wordChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      alphabetic[i] = true;
  }
}