View Javadoc
1   
2   
3   package eu.simuline.util.sgml;
4   
5   import eu.simuline.util.ListMap;
6   
7   import java.io.Reader;
8   import java.io.IOException;
9   
10  import org.xml.sax.ContentHandler;
11  import org.xml.sax.Locator;
12  import org.xml.sax.Attributes;
13  import org.xml.sax.InputSource;
14  import org.xml.sax.SAXException;
15  import org.xml.sax.SAXParseException;
16  import java.util.Locale;
17  
18  /**
19   * A rudimentary <code>SGML</code> parser with something like a SAX-api. 
20   *
21   * @author <a href="mailto:ernst.reissner@simuline.eu">Ernst Reissner</a>
22   * @version 1.0
23   */
24  public final class SGMLParser {
25  
    /**
     * A closing double quote followed by a period and a blank. 
     * Appended to exception messages after a quoted piece of input. 
     */
    private static final String QUOTE_DOT = "\". ";

    /**
     * The character <code>=</code> which separates 
     * the name of an attribute from its value. 
     */
    private static final char   SYMB_EQ   = '=';

    /**
     * The character <code>-</code>: 
     * a comment is recognized by two of them following <code>&lt;!</code>. 
     */
    private static final char   SYMB_COMMENT = '-';

    /**
     * The character <code>&lt;</code>. 
     * NOTE(review): presumably the character opening a tag; 
     * it is not used in the part of the file visible here. 
     */
    private static final char   SYMB_TAG = '<';
30  
31      /* --------------------------------------------------------------------- *
32       * inner classes                                                         *
33       * --------------------------------------------------------------------- */
34  
35      /**
36       * A <code>ContentHandler</code> which simply ignores all events.  
37       * May be used for debugging. 
38       */
39      static class TrivialContentHandler implements ContentHandler {
40  
41  	/** <!-- api-docs inherited from interface implemented.  -->*/ 
42  	public void setDocumentLocator(Locator locator) {
43  	    // is empty. 
44  	}
45  
46  	public void startDocument() throws SAXException {
47  	    // is empty. 
48  	}
49  
50  	public void endDocument() throws SAXException {
51  	    // is empty. 
52  	}
53  
54  	public void startPrefixMapping(String prefix,
55  				       String uri)
56  	    throws SAXException {
57  	    // is empty. 
58  	}
59  
60  	public void endPrefixMapping(String prefix)
61  	    throws SAXException {
62  	    // is empty. 
63  	}
64  
65  	public void startElement(String namespaceURI,
66  				 String localName,
67  				 String qName,
68  				 Attributes atts)
69  	    throws SAXException {
70  	    // is empty. 
71  	}
72  
73  	public void endElement(String namespaceURI,
74  			       String localName,
75  			       String qName)
76  	    throws SAXException {
77  	    // is empty. 
78  	}
79  
80  	public void characters(char[] chr,
81  			       int start,
82  			       int length)
83  	    throws SAXException {
84  	    // is empty. 
85  	}
86  
87  	public void ignorableWhitespace(char[] chr,
88  					int start,
89  					int length)
90  	    throws SAXException {
91  	    // is empty. 
92  	}
93  
94  	public void processingInstruction(String target,
95  					  String data)
96  	    throws SAXException {
97  	    // is empty. 
98  	}
99  
100 	public void skippedEntity(String name)
101 	    throws SAXException {
102 	    // is empty. 
103 	}
104     } // class TrivialContentHandler 
105 
106 
107     /**
108      * An **** partial **** implementation 
109      * of the SAX-interface <code>Attributes</code> 
110      * which allows to set name-value-pairs by method {@link #addAttribute}. 
111      */
112     class AttributesWrapper {
113 
114 	/* ----------------------------------------------------------------- *
115 	 * fields                                                            *
116 	 * ----------------------------------------------------------------- */
117 
118 	/**
119 	 * See {@link AttributesImpl#name2value}. 
120 	 */
121 	private final ListMap<String, String> name2value;
122 
123 	/* ----------------------------------------------------------------- *
124 	 * constructors                                                      *
125 	 * ----------------------------------------------------------------- */
126 
127 	/**
128 	 * Creates a new empty <code>AttributesWrapper</code> 
129 	 * which represents an empty attribute list. 
130 	 */
131 	AttributesWrapper() {
132 	    this.name2value = new ListMap<String, String>();
133 	}
134 
135 	/* ----------------------------------------------------------------- *
136 	 * methods                                                           *
137 	 * ----------------------------------------------------------------- */
138 
139 
140 	/**
141 	 * Adds an attribute with the given name and value. 
142 	 *
143 	 * @param attName 
144 	 *     the <code>String</code> representation 
145 	 *     of the name of an attribute. 
146 	 * @param attValue 
147 	 *     the value of an attribute as a <code>String</code>. 
148 	 *     If no value is provided, 
149 	 *     this is {@link AttributesImpl#NO_VALUE}. 
150 	 */
151 	void addAttribute(String attName, String attValue) {
152 	    String oldAttValue = this.name2value
153 		.put(attName, attValue);
154 	    if (oldAttValue != null) {
155 		// Here, the attribute has occured before. 
156 		SGMLParser.this.parseExceptionHandler
157 		    .foundMultipleAttribute(attName,
158 					    oldAttValue);
159 	    }
160  	}
161 
162 	Attributes getAttributes() {
163 	    return new AttributesImpl(this.name2value);
164 	}
165     } // class AttributesWrapper 
166 
167 
    /**
     * A test on single characters: 
     * provides a single method which decides whether the given character 
     * passes a certain test. 
     * {@link Buffer#readArray} uses such a test to determine 
     * the character at which scanning stops. 
     */
    interface CharTester {

	/**
	 * Returns whether the given character <code>chr</code> 
	 * passes the test given by this <code>CharTester</code>. 
	 *
	 * @param chr 
	 *    the <code>char</code> to be tested. 
	 * @return the <code>boolean</code> 
	 *    signifying whether the given character <code>chr</code> 
	 *    passes the test given by this <code>CharTester</code>. 
	 */
	boolean testChar(char chr);

    } // interface CharTester 
187 
188     /**
189      * Tests for blank, <code>/</code>, <code>&gt;</code>. 
190      */
191     private static final CharTester TEST_BLANK_GT_SLASH = new CharTester() {
192 	public boolean testChar(char chr) {
193 	    return Character.isWhitespace(chr) 
194 		|| chr == '/' 
195 		|| chr == '>';
196 	}
197     };
198 
199     /**
200      * Tests for blank or <code>&gt;</code>. 
201      */
202     private static final CharTester TEST_BLANK_GT = new CharTester() {
203 	public boolean testChar(char chr) {
204 	    return Character.isWhitespace(chr) 
205 		|| chr == '>';
206 	}
207     };
208 
209     /*
210      * Tests for <code>/</code> or <code>&gt;</code>. 
211      */
212 /*
213     private static final CharTester TEST_GT_SLASH = new CharTester() {
214 	    public boolean testChar(char ch) {
215 		return ch == '/' 
216 		    || ch == '>';
217 	    }
218 	};
219 */
220 
221     /**
222      * Tests for <code>&lt;</code>. 
223      */
224     private static final CharTester TEST_LT = new CharTester() {
225 	    public boolean testChar(char chr) {
226 		return chr == '<';
227 	    }
228 	};
229 
230     /**
231      * Tests for <code>&gt;</code>. 
232      */
233     private static final CharTester TEST_GT = new CharTester() {
234 	    public boolean testChar(char chr) {
235 		return chr == '>';
236 	    }
237 	};
238 
239     /**
240      * Tests for <code>=</code> and for <code>&gt;</code>. 
241      */
242     private static final CharTester TEST_BLANK_EQUALS_GT = new CharTester() {
243 	    public boolean testChar(char chr) {
244 		return Character.isWhitespace(chr) 
245 		    || chr == '='
246 		    || chr == '>';
247 	    }
248 	};
249 
250     /**
251      * Tests for whitespace. 
252      */
253     private static final CharTester TEST_NO_WHITESPACE = new CharTester() {
254 	    public boolean testChar(char chr) {
255 		return !Character.isWhitespace(chr);
256 	    }
257 	};
258 
259     /**
260      * Tests for quote both for<code>'</code> and for <code>"</code>. 
261      */
262 /*
263     private static final CharTester TEST_QUOTE = new CharTester() {
264 	    public boolean testChar(char chr) {
265 		return chr == '\'' 
266 		    || chr == '"';
267 	    }
268 	};
269 */
270 
271     /*
272      * Tests for end of comment <code>--></code>. 
273      * This tests for a sequence of characters 
274      * and confirms after having read the last one. 
275      */
276     private static final CharTester TEST_END_OF_COMMENT = new CharTester() {
277 
278 	    /**
279 	     * Contains the sequence <code>--></code> 
280 	     * representing the end of a comment. 
281 	     */
282 	    static final String END_OF_COMMENT = "-->";
283 
284 	    /**
285 	     * Contains the index in {@link #END_OF_COMMENT} 
286 	     * which is to be compared next by {@link #testChar}. 
287 	     */
288 	    private int index = 0;
289 
290 	    /**
291 	     * Returns whether the last characters tested 
292 	     * are <code>--></code>. 
293 	     *
294 	     * @param chr 
295 	     *    a <code>char</code>. 
296 	     * @return
297 	     *    whether the last characters tested 
298 	     *    including <code>char</code> are <code>--></code>. 
299 	     *    In particular, if less than three characters are read 
300 	     *    this is <code>false</code>. 
301 	     */
302 	    public boolean testChar(char chr) {		
303 		if (END_OF_COMMENT.charAt(index++) == chr) {
304 		    if (this.index == END_OF_COMMENT.length() - 1) {
305 			this.index = 0;
306 			return true;
307 		    } else {
308 			return false;
309 		    }
310 		} else {
311 		    this.index = 0;
312 		    return false;
313 		}
314 	    }
315 	}; // TEST_END_OF_COMMENT 
316 
317     /**
318      * A <code>CharTester</code> which allows to specify 
319      * the character which passes the test. 
320      */
321     static class SpecCharTester implements CharTester {
322 
323 	/**
324 	 * The character which passes the test {@link #testChar}. 
325 	 */
326 	private char chr;
327 
328 	/**
329 	 * Sets {@link #chr} to the specified character value. 
330 	 *
331 	 * @param chr 
332 	 *    a <code>char</code> value. 
333 	 */
334 	void setChar(char chr) {
335 	    this.chr = chr;
336 	}
337 
338 	/**
339 	 * Returns whether the given character coincides with {@link #chr}. 
340 	 *
341 	 * @param chr 
342 	 *    a <code>char</code> value. 
343 	 * @return 
344 	 *    whether <code>ch</code> coincides with {@link #chr}. 
345 	 */
346 	public boolean testChar(char chr) {
347 	    return chr == this.chr;
348 	}
349     } // SpecCharTester 
350 
    /**
     * Tests for a specified character. 
     * This is used for quotes which allow the cases 
     * <code>'</code> and <code>"</code>: 
     * when parsing a quoted attribute value, 
     * the opening quote is set via {@link SpecCharTester#setChar} 
     * and then it is scanned for the matching closing quote. 
     * Note that this single instance is static and mutable, 
     * i.e. its state is shared by all parsers. 
     *
     * @see XMLsGMLspecifica#parseAttribute
     */
    private static final SpecCharTester TEST_SPEC = new SpecCharTester();
359 
360     /**
361      * Class which buffers the read stream. 
362      */
363     static class Buffer {
364 
365 	/* ----------------------------------------------------------------- *
366 	 * fields                                                            *
367 	 * ----------------------------------------------------------------- */
368 
369 	/**
370 	 * The reader buffered. 
371 	 */
372 	private final Reader reader;
373 
374 	/**
375 	 * The current buffer. 
376 	 * The current parts to be read start with 
377 	 * <code>bufferArray[{@link #start}]</code> and end with 
378 	 * <code>bufferArray[{@link #end}]</code>, exclusively. 
379 	 */
380 	private final char[] bufferArray;
381 
382 	/**
383 	 * The first index in {@link #bufferArray} 
384 	 * read in from {@link #reader} but not returned 
385 	 * by {@link #readArray} or {@link #readChar}. 
386 	 */
387 	private int start;
388 
389 	/**
390 	 * Set by {@link #readArray} and read by {@link #getStartAndMove}. 
391 	 * When invoking {@link #readArray} <code>newStart</code> 
392 	 * is set to {@link #start} and increased 
393 	 * by the number of read charactersincreases. 
394 	 * Then {@link #getStartAndMove} updates {@link #start} 
395 	 * according to <code>newStart</code>. 
396 	 */
397 	private int newStart;
398 
399 	/**
400 	 * The first index in {@link #bufferArray} not read 
401 	 * from {@link #reader} 
402 	 * or <code>-1</code> if the end of the stream is reached. 
403 	 * This means that <code>bufferArray[end]</code> 
404 	 * either does not exist or at least is not significant. 
405 	 */
406 	private int end;
407 
408 	/* ----------------------------------------------------------------- *
409 	 * constructors                                                      *
410 	 * ----------------------------------------------------------------- */
411 
412 	/**
413 	 * Creates a new <code>Buffer</code> from the given reader 
414 	 * with the given size. 
415 	 *
416 	 * @param reader 
417 	 *    the <code>Reader</code> to be buffered. 
418 	 * @param length 
419 	 *    the length of the buffer. 
420 	 * @exception IOException 
421 	 *    if an error occurs 
422 	 */
423 	Buffer(Reader reader, int length) throws IOException {
424 	    this.reader = reader;
425 	    this.bufferArray = new char[length];
426 	    this.start = 0;
427 	    this.end = this.start; // signifies: reading necessary. 
428 	}
429 
430 	/* ----------------------------------------------------------------- *
431 	 * methods                                                           *
432 	 * ----------------------------------------------------------------- */
433 
434 
435 	/**
436 	 * Returns whether this buffer is currently empty. 
437 	 * When this is the case and someone tries to read further characters 
438 	 * this will lead to a trial 
439 	 * to read further pieces from {@link #reader}. 
440 	 *
441 	 * @return a <code>boolean</code> value 
442 	 *    signifying whether this buffer is currently empty. 
443 	 */
444 	boolean isEmpty() {
445 	    return this.end == this.start;
446 	}
447 
448 	/**
449 	 * Returns whether the end of the stream is reached. 
450 	 *
451 	 * @return 
452 	 *    a <code>boolean</code> specifying 
453 	 *    whether the end of the stream is reached. 
454 	 */
455 	boolean reachedEOS() {
456 	    return this.end == -1;
457 	}
458 
459 	/**
460 	 * Reads a single <code>char</code> and returns it. 
461 	 *
462 	 * @return 
463 	 *    an <code>int</code> value 
464 	 *    which is either the next <code>char</code> read in 
465 	 *    or <code>-1</code> which signifies the end of the stream. 
466 	 * @exception IOException 
467 	 *    if an error occurs
468 	 */
469 	int readChar() throws IOException {
470 	    if (reachedEOS()) {
471 		return -1;
472 	    }
473 	    if (isEmpty()) {
474 		this.start = 0;
475 		this.end = this.reader.read(this.bufferArray);
476 		if (reachedEOS()) {
477 		    return -1;
478 		}
479 	    }
480 	    return this.bufferArray[this.start++];
481 	}
482 
483 	/**
484 	 * Reads an array from {@link #reader}. 
485 	 * As a side effect, writes the field {@link #newStart}. 
486 	 * Also, if the portion of {@link #bufferArray} 
487 	 * to be read, i.e. between {@link #start} and {@link #end}, 
488 	 * is empty, a new portion is buffered. 
489 	 *
490 	 * @param charTester 
491 	 *    a <code>CharTester</code> which signifies 
492 	 *    when to end reading from the buffer. 
493 	 * @return 
494 	 *    an <code>int</code> signifying the number of <code>char</code>s 
495 	 *    read or <code>-1</code> which signifies the end of the stream. 
496 	 *    It is read to the next &lt; or, if there is none, 
497 	 *    to the end of the stream. 
498 	 *    Thus there is a difference between the return values 
499 	 *    <code>-1</code> and <code>0</code>. 
500 	 * @exception IOException 
501 	 *    if an error occurs
502 	 */
503 	int readArray(CharTester charTester) throws IOException {
504 	    if (reachedEOS()) {
505 		return -1;
506 	    }
507 	    if (isEmpty()) {
508 		this.start = 0;
509 		this.end = this.reader.read(this.bufferArray);
510 //System.out.println("read: "+this.end);
511 		if (reachedEOS()) {
512 		    return -1;
513 		}
514 	    }
515 
516 	    for (int i = this.start; i < this.end; i++) {
517 		if (charTester.testChar(this.bufferArray[i])) {
518 		    // found match described by charTester 
519 		    this.newStart = i;
520 		    return this.newStart - this.start;
521 		}
522 	    }
523             // Here, the test always failed. 
524 	    this.newStart = this.end;
525 	    return this.end - this.start;
526 	}
527 
528 	/**
529 	 * Describe <code>readStringBuffer</code> method here.
530 	 *
531 	 * @param charTester 
532 	 *    a <code>CharTester</code> which determines 
533 	 *    the first character not read 
534 	 *    into the resulting <code>StringBuffer</code>. 
535 	 * @param elementName
536 	 *    a <code>String</code> which determines 
537 	 *    the element under consideration. 
538 	 *    This is only used for generating the message of a 
539 	 *    <code>SAXParseException</code>. 
540 	 *    <p>
541 	 *    Allowed values: {@link #START_TAG}, {@link #END_TAG}, 
542 	 *    {@link #PROC_INSTR}, 
543 	 *    {@link #ATTR_NAME}, {@link #WHITESP_IN_ATTR} 
544 	 *    and {@link #ATTR_VALUE}. ****** comment and &lt;!element missing. 
545 	 * @return 
546 	 *    a <code>StringBuffer</code> containing characters 
547 	 *    starting with the current one until one 
548 	 *    <code>charTester</code> returns <code>true</code>. 
549 	 * @exception IOException 
550 	 *    if an io-error occurs
551 	 * @exception SAXParseException 
552 	 *    if the parser faces the end of the stream 
553 	 *    while scanning the current element. 
554 	 */
555 	StringBuffer readStringBuffer(CharTester charTester, 
556 				      String elementName) 
557 	    throws IOException, SAXParseException {
558 
559 	    StringBuffer qName = new StringBuffer();
560 	    int numRead = 0;
561 	    do {
562 		numRead = readArray(charTester);
563 		if (numRead == -1) {
564 		    throw new SAXParseException
565 			("End of stream while scanning " 
566 			 + elementName + ". " 
567 			 + "Read so far: \"" 
568 			 + qName + QUOTE_DOT, null);
569 		}
570 		qName.append(getChars(),
571 			     getStartAndMove(),
572 			     numRead);
573 	    } while (isEmpty());
574 
575 	    return qName;
576 	}
577 
578 	/**
579 	 * Returns the buffer of <code>char</code>s. 
580 	 *
581 	 * @return 
582 	 *    the <code>char[]</code> {@link #bufferArray}. 
583 	 */
584 	char[] getChars() {
585 	    return this.bufferArray;
586 	}
587 
588 	/**
589 	 * Moves {@link #newStart} to {@link #start} 
590 	 * and returns the old value of {@link #start}. 
591 	 *
592 	 * @return 
593 	 *    the old <code>int</code> value of {@link #start}. 
594 	 */
595 	int getStartAndMove() {
596 	    int ret = this.start;
597 	    this.start = this.newStart;
598 	    return ret;
599 	}
600 
601 	/**
602 	 * Get method for {@link #start}. 
603 	 *
604 	 * @return {@link #start}
605 	 */
606 	int getStart() {
607 	    return this.start;
608 	}
609 
610 	/**
611 	 * Get method for {@link #end}. 
612 	 *
613 	 * @return {@link #end}
614 	 */
615 	int getEnd() {
616 	    return this.end;
617 	}
618     } // class Buffer
619 
    /**
     * Provides a bunch of methods for parsing 
     * with implementations specific to xml and sgml. 
     */
    interface XMLsGMLspecifica {
	// **** SGMLParser.this.currChar must be the character 
	// after the attribute list. 
	/**
	 * Parses one attribute and adds it to the given attribute list. 
	 *
	 * @param attributes 
	 *    an <code>AttributesWrapper</code> 
	 *    to which the attribute parsed is added. 
	 * @exception IOException 
	 *    if an io-error occurs
	 * @exception SAXException 
	 *    if a syntactical error occurs
	 */
	void parseAttribute(AttributesWrapper attributes) 
	    throws IOException, SAXException;

	/**
	 * Parses a comment or any declaration 
	 * starting with <code>&lt;!...</code>. 
	 * Whether the handler is notified depends on the implementation. 
	 *
	 * @exception IOException 
	 *    if an io-error occurs
	 * @exception SAXException 
	 *    if a syntactical error occurs
	 */
	void parseCommentElemTypeDecl() throws IOException, SAXException;

	/**
	 * Parses a processing instruction or any declaration 
	 * starting with <code>&lt;?...</code>. 
	 * Whether the handler is notified depends on the implementation. 
	 *
	 * @exception IOException 
	 *    if an io-error occurs
	 * @exception SAXException 
	 *    if a syntactical error occurs
	 */
	void parseExtProcessingInstruction() throws IOException, SAXException;

    } // interface XML_SGML_Specifica 
664 
665     /**
666      * Contains the <code>HTML</code>-specific part of the parser. 
667      */
668     private final XMLsGMLspecifica htmlAttributeParser = 
669     new XMLsGMLspecifica() {
670 
671 	public void parseAttribute(AttributesWrapper attributes) 
672 	    throws IOException, SAXException {
673 	    String attName;
674 	    String attValue;
675 	    StringBuffer qName;
676 
677 	    // Parse attribute name 
678 	    qName = SGMLParser.this.buffer.
679 		readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
680 	    qName.insert(0, (char) SGMLParser.this.currChar);
681 	    attName = qName.toString().toLowerCase(Locale.ENGLISH);
682 //System.out.println("attName: |"+attName+"|");
683 
684 	    // Here, the attribute may have a value or not. 
685 
686 	    // Skip whitespace either after having parsed the attribute 
687 	    // or between its name and its value. 
688 	    SGMLParser.this.currChar = 
689 		SGMLParser.this.buffer.readChar(); //NOPMD
690 	    if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
691 		qName = SGMLParser.this.buffer.
692 		    readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
693 		SGMLParser.this.currChar = 
694 		    SGMLParser.this.buffer.readChar(); //NOPMD
695 	    }
696 
697 	    // Here is the decision whether a value is provided or not. 
698 	    if (SGMLParser.this.currChar != SYMB_EQ) {
699 		// Here, no value may be given 
700 		attributes.addAttribute(attName, AttributesImpl.NO_VALUE);
701 //System.out.println("attName: |"+attName+"|");
702 //System.out.println("noValue@@"+(char)SGMLParser.this.currChar+"|");
703 		return;
704 	    }
705 	    // Here, clearly a value must follow 
706 
707 	    // Skip whitespaces 
708 	    qName = SGMLParser.this.buffer.
709 		readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
710 	    SGMLParser.this.currChar = 
711 		SGMLParser.this.buffer.readChar(); //NOPMD
712 
713 	    // Parse the attribute value. 
714 	    switch (SGMLParser.this.currChar) {
715 		case '\'':
716 		    // fall through 
717 		case '"':
718 		    // the attribute value is quoted. 
719 		    char quote = (char) SGMLParser.this.currChar;
720 		    TEST_SPEC.setChar(quote);
721 		    //SGMLParser.this.currChar = 
722 		    //	SGMLParser.this.buffer.readChar();
723 
724 //System.out.println("quote@@"+SGMLParser.this.currChar);
725 		    qName = new StringBuffer();
726 		    while (true) {
727 			qName.append(SGMLParser.this.buffer.
728 				     readStringBuffer(TEST_SPEC, ATTR_VALUE));
729 			if (qName.length() != 0 
730 			    && qName.charAt(qName.length() - 1) == '\\') {
731 			    qName.setCharAt(qName.length() - 1, quote);
732 			} else {
733 			    // read the quote 
734 			    SGMLParser.this.currChar = //NOPMD
735 				SGMLParser.this.buffer.readChar();
736 			    break;
737 			}
738 		    }
739 		    break;
740 		default:
741 //System.out.println("no quote@@"+SGMLParser.this.currChar);
742 		    // the attribute value is not quoted. 
743 		    qName = SGMLParser.this.buffer.
744 			readStringBuffer(TEST_BLANK_GT, ATTR_VALUE);
745 		    qName.insert(0, (char) SGMLParser.this.currChar);
746 		    break;
747 	    }
748 	    // read the character after the attribute value 
749 	    SGMLParser.this.currChar = 
750 		SGMLParser.this.buffer.readChar(); //NOPMD
751 
752 	    attValue = qName.toString();
753 	    attributes.addAttribute(attName, attValue);
754 //System.out.println("attName: |"+attName+"|");
755 //System.out.println("attValue: |"+attValue+"|");
756 	}
757 
758 	public void parseCommentElemTypeDecl() 
759 	    throws IOException, SAXException {
760 //System.out.println("comment?");
761 
762 	    SGMLParser.this.currChar = //NOPMD
763 		SGMLParser.this.buffer.readChar();
764 	    if (SGMLParser.this.currChar != SYMB_COMMENT) {
765 		//int numRead = 
766 		SGMLParser.this.buffer.readArray(TEST_GT);
767 		SGMLParser.this.buffer.getStartAndMove();
768 		return;
769 	    }
770 	    // Here, object starts with "<!-....."
771 
772 	    SGMLParser.this.currChar = //NOPMD
773 		SGMLParser.this.buffer.readChar();
774 	    if (SGMLParser.this.currChar != SYMB_COMMENT) {
775 		throw new SAXParseException
776 		    ("Comments must start with \"<!--\" but found " 
777 		     + "\"<!-" + (char) SGMLParser.this.currChar + QUOTE_DOT, 
778 		     null);
779 	    }
780 //System.out.println("comment!");
781 
782 	    int numRead = 0;
783 	    do {
784 		numRead = SGMLParser.this.buffer
785 		    .readArray(TEST_END_OF_COMMENT);
786 		if (numRead == -1) {
787 		    StringBuffer qName = new StringBuffer();
788 		    qName.append(SGMLParser.this.buffer.getChars(),
789 				 SGMLParser.this.buffer.getStartAndMove(),
790 				 numRead);
791 		    throw new SAXParseException
792 			("End of stream while scanning comment. " 
793 			 + "Recently read: \"" + qName + QUOTE_DOT, 
794 			 null);
795 		}
796 
797 		SGMLParser.this.buffer.getStartAndMove();
798 	    } while (SGMLParser.this.buffer.isEmpty());
799 /*
800 	    StringBuffer qName = new StringBuffer();
801 	    qName.append(SGMLParser.this.buffer.getChars(),
802 			 SGMLParser.this.buffer.getStartAndMove(),
803 			 numRead);
804 
805 	    System.out.println("read so far: |"+qName+"|");
806 */
807 
808 	    SGMLParser.this.buffer.getStartAndMove();	    
809 	    // NO NOTIFY!!
810 	}
811 
812 	public void parseExtProcessingInstruction() 
813 	    throws IOException, SAXException {
814 	    parseStartOrStartEndTag();
815 	}
816     }; // htmlXML_SGML_Specifica
817 
818     /**
819      * Contains the <code>XML</code>-specific part of the parser. 
820      */
821     private final XMLsGMLspecifica xmlAttributeParser = 
822     new XMLsGMLspecifica() {
823 
824 	public void parseAttribute(AttributesWrapper attributes) 
825 	    throws IOException, SAXException {
826 	    String attName;
827 	    String attValue;
828 	    StringBuffer qName;
829 
830 	    // Parse attribute name 
831 	    qName = SGMLParser.this.buffer.
832 		readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
833 	    qName.insert(0, (char) SGMLParser.this.currChar);
834 	    attName = qName.toString();
835 //System.out.println("attName: |"+attName+"|");
836 
837 	    // Here, the attribute may have a value or not. 
838 
839 	    // Skip whitespace either after having parsed the attribute 
840 	    // or between its name and its value. 
841 	    SGMLParser.this.currChar = 
842 		SGMLParser.this.buffer.readChar(); //NOPMD
843 	    if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
844 		qName = SGMLParser.this.buffer.
845 		    readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
846 		SGMLParser.this.currChar = 
847 		    SGMLParser.this.buffer.readChar(); //NOPMD
848 	    }
849 
850 	    // Here is the decision whether a value is provided or not. 
851 	    if (SGMLParser.this.currChar != SYMB_EQ) {
852 		// Here, a value is missing. 
853 		throw new SAXParseException
854 		    ("Missing value for attribute \"" 
855 		     + attName + QUOTE_DOT, null);
856 	    }
857 	    // Here, clearly a value must follow ****
858 
859 
860 	    // Skip whitespaces 
861 	    qName = SGMLParser.this.buffer.
862 		readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
863 	    SGMLParser.this.currChar = 
864 		SGMLParser.this.buffer.readChar(); //NOPMD
865 
866 
867 	    // Parse the attribute value. 
868 	    switch (SGMLParser.this.currChar) {
869 		case '\'':
870 		    // fall through 
871 		case '"':
872 		    // the attribute value is quoted. 
873 		    char quote = (char) SGMLParser.this.currChar;
874 		    TEST_SPEC.setChar(quote);
875 		    //SGMLParser.this.currChar = 
876 		    //	SGMLParser.this.buffer.readChar();
877 
878 //System.out.println("quote@@"+SGMLParser.this.currChar);
879 		    qName = new StringBuffer();
880 		    while (true) {
881 			qName.append(SGMLParser.this.buffer.
882 				     readStringBuffer(TEST_SPEC, ATTR_VALUE));
883 			if (qName.length() != 0 
884 			    && qName.charAt(qName.length() - 1) == '\\') {
885 			    qName.setCharAt(qName.length() - 1, quote);
886 			} else {
887 			    // read the quote 
888 			    SGMLParser.this.currChar = //NOPMD
889 				SGMLParser.this.buffer.readChar();
890 			    break;
891 			}
892 		    }
893 		    break;
894 		default:
895 		    throw new SAXParseException
896 			("Value of attribute \"" + attName + 
897 			 "\" is not quoted. ",
898 			 null);
899 	    }
900 	    // read the character after the attribute value 
901 	    SGMLParser.this.currChar = 
902 		SGMLParser.this.buffer.readChar(); //NOPMD
903 
904 	    attValue = qName.toString();
905 	    attributes.addAttribute(attName, attValue);
906 //System.out.println("attName: |"+attName+"|");
907 //System.out.println("attValue: |"+attValue+"|");
908 	}
909 
910 	public void parseCommentElemTypeDecl() 
911 	    throws IOException, SAXException {
912 	    // ******** comments will not work that way!!!*****
913 
914 	    SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
915 	    //**** comment
916 	    // Here, also the empty processing instruction or comment 
917 	    // would be possible. 
918 	
919 	    //this.buffer.getStart();
920 //System.out.println("-qName: |"+qName+"|");
921 	    SGMLParser.this.currChar = 
922 		SGMLParser.this.buffer.readChar(); //NOPMD
923 	    //assert this.currChar == '>';
924 	    SGMLParser.this.contentHandler.processingInstruction(null, null);
925 	}
926 
927 
928 	public void parseExtProcessingInstruction() 
929 	    throws IOException, SAXException {
930 
931 	    //StringBuffer qName = 
932 	    SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
933 	    // Here, also the empty processing instruction would be possible. 
934 	
935 	    //this.buffer.getStart();
936 //System.out.println("-qName: |"+qName+"|");
937 	    SGMLParser.this.currChar = 
938 		SGMLParser.this.buffer.readChar(); //NOPMD
939 	    //assert this.currChar == '>';
940 	    SGMLParser.this.contentHandler.processingInstruction(null, null);
941 	}
942 
943     }; // xmlXML_SGML_Specifica
944 
945     /* --------------------------------------------------------------------- *
946      * class constants                                                       *
947      * --------------------------------------------------------------------- */
948 
949 
    /**
     * The size of the buffer used internally. 
     * This must be at least <code>1</code>. 
     * I found no significant difference in speed when increasing this number. 
     * The buffer coming from a stream from a URL seems to have maximal size 
     * of <code>1448</code> whereas for file streams there seems no bound. 
     * In the cases considered, the file is read in as a whole. 
     */
    private static final int BUFFER_SIZE = 999999;

    // The following constants are passed as parameter elementName 
    // to Buffer.readStringBuffer: 
    // they name the object currently parsed 
    // in the message of the SAXParseException thrown at end of stream. 

    /**
     * Short string representation of a start tag. 
     * Contains the specific part of the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String START_TAG = "start tag";

    /**
     * Short string representation of an end tag. 
     * Contains the specific part of the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String END_TAG = "end tag";

    /**
     * Short string representation of a processing instruction. 
     * Contains the specific part of the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String PROC_INSTR = "processing instruction";

    /**
     * Short string representation of the name of an attribute. 
     * Contains the specific part of the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String ATTR_NAME = "attribute name";

    /**
     * Short string representation of whitespace within an attribute, 
     * i.e. around the equals sign between name and value. 
     * Contains the specific part of the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String WHITESP_IN_ATTR = "whitespace in attribute";

    /**
     * Short string representation of the value of an attribute. 
     * Contains the specific part of the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String ATTR_VALUE = "attribute value";
1002 
1003     /* --------------------------------------------------------------------- *
1004      * fields                                                                *
1005      * --------------------------------------------------------------------- */
1006 
1007     /**
1008      * Contains class with methods specific for xml and sgml, respectively. 
1009      */
1010     private XMLsGMLspecifica xmlSgmlSpecifica = htmlAttributeParser;
1011 
1012     /**
1013      * The current character or <code>-1</code> 
1014      * to signify the end of the stream. 
1015      */
1016     private int currChar;
1017 
1018     /**
1019      * The <code>ContentHandler</code> registered. 
1020      */
1021     private ContentHandler contentHandler;
1022 
1023     /**
1024      * The <code>ParseExceptionHandler</code> registered. 
1025      */
1026     private ParseExceptionHandler parseExceptionHandler;
1027 
1028     /**
1029      * The buffer of the input stream. 
1030      */
1031     private Buffer buffer;
1032 
1033     /* --------------------------------------------------------------------- *
1034      * constructors                                                          *
1035      * --------------------------------------------------------------------- */
1036 
1037     /**
1038      * Creates a new <code>SGMLParser</code> 
1039      * with the default handlers for content and exceptions. 
1040      */
1041     @SuppressWarnings("checkstyle:nowhitespaceafter")
1042     public SGMLParser() {
1043 	this.       contentHandler = new TrivialContentHandler();
1044 	this.parseExceptionHandler = new ParseExceptionHandler.Impl();
1045     }
1046 
1047     /* --------------------------------------------------------------------- *
1048      * methods                                                               *
1049      * --------------------------------------------------------------------- */
1050 
1051 
1052     /**
1053      * Parses the <code>InputSource</code> given 
1054      * but delegates everything inside a tag or a processing instruction 
1055      * to {@link #parseTagOrPI}. 
1056      *
1057      * @param src 
1058      *    an <code>InputSource</code>. 
1059      * @exception IOException if an error occurs
1060      * @exception SAXException if an error occurs
1061      */
1062     void parse(InputSource src) throws IOException, SAXException {
1063 	parse(src.getCharacterStream());
1064     }
1065 
1066     /**
1067      * Parses the given <code>InputStream</code>. 
1068      *
1069      * @param reader 
1070      *     an <code>Reader</code> sequentializing an SGML document. 
1071      * @exception IOException 
1072      *     if an error reading the stream occurs. 
1073      * @exception SAXException 
1074      *    if an error with the sgml-syntax occurs. 
1075      */
1076     public void parse(Reader reader) throws IOException, SAXException {
1077 
1078 	this.buffer = new Buffer(reader, BUFFER_SIZE);
1079 	int numRead = this.buffer.readArray(TEST_LT);
1080 	// notify handler that first part of document was successfully read. 
1081 	this.contentHandler.startDocument();
1082 	while (numRead != -1) {
1083 	    this.currChar = this.buffer.readChar(); // the '<' char? 
1084 	    if (this.currChar == SYMB_TAG) {
1085 		// a tag or a PI. 
1086 		numRead = parseTagOrPI();
1087 	    } else {
1088 		// either characters or ignoreableWhitespace
1089 		numRead = parseText();
1090 	    }
1091 	}
1092 	// Here, the document is finished. 
1093 	this.contentHandler.endDocument();
1094     }
1095 
1096     /**
1097      * Parses everything outside a tag, a processing instruction, ... 
1098      * everything within brackets <code>&lt;</code> and <code>&gt;</code>. 
1099      * ***** Missing: distinction between notification 
1100      * of characters and whitespace. ****
1101      *
1102      * @exception IOException 
1103      *     if an error reading the stream occurs. 
1104      * @exception SAXException 
1105      *    if an error with the sgml-syntax occurs. 
1106      * @see #parseTagOrPI
1107      */
1108     private int parseText() throws IOException, SAXException {
1109 	int numRead = this.buffer.readArray(TEST_LT);
1110 	if (numRead != -1) {
1111 /*
1112   System.out.println("text: |"+new String(buffer.getChars(),
1113   buffer.getStartAndMove(),
1114   numRead)+"|");
1115 */
1116 	    this.contentHandler.characters(this.buffer.getChars(),
1117 					   this.buffer.getStartAndMove(),
1118 					   numRead);
1119 	}
1120 	
1121 //buffer.getStartAndMove();
1122 	return numRead;
1123     }
1124 
1125     /**
1126      * Parses an end-tag notifying the underlying handler. 
1127      *
1128      * @exception IOException 
1129      *     if an error reading the stream occurs. 
1130      * @exception SAXException 
1131      *    if an error with the sgml-syntax occurs. 
1132      */
1133     void parseEndTag() throws IOException, SAXException {
1134 	StringBuffer qName = this.buffer.readStringBuffer(TEST_GT, END_TAG);
1135 	// Here, also the empty tag would be possible. 
1136 	
1137 	//this.buffer.getStart();
1138 //System.out.println("end tag: |"+qName+"|");
1139 	this.currChar = this.buffer.readChar();
1140 	//assert this.currChar == '>';
1141 	this.contentHandler.endElement(null,
1142 				       null,
1143 				       qName.toString());
1144 	this.currChar = this.buffer.readChar();
1145     }
1146 /*
1147   public void parseCommentElemTypeDecl() 
1148   throws IOException, SAXException {
1149   // ******** comments will not work that way!!!*****
1150 
1151   StringBuffer qName = this.buffer
1152   .readStringBuffer(TEST_GT, PROC_INSTR);//**** comment
1153   // Here, also the empty processing instruction or comment 
1154   // would be possible. 
1155 	
1156   //this.buffer.getStart();
1157 //System.out.println("-qName: |"+qName+"|");
1158 this.currChar = this.buffer.readChar();
1159 //assert this.currChar == '>';
1160 this.handler.processingInstruction(null, null);
1161 }
1162 */
1163 
1164     /**
1165      * Parses a start-tag or, for xml, an empty tag. 
1166      *
1167      * @exception IOException 
1168      *     if an error reading the stream occurs. 
1169      * @exception SAXException 
1170      *    if an error with the sgml-syntax occurs. 
1171      */
1172     void parseStartOrStartEndTag() throws IOException, SAXException {
1173 
1174 	// ***** Better read the name of the tag and 
1175 	// then single out problems with chars by a handler
1176 	if (!Character.isLetter((char) this.currChar)) {
1177 	    this.parseExceptionHandler
1178 		.foundIllegalCharInTag((char) this.currChar);
1179 	    // Ignore the previously read char. 
1180 	    this.currChar = this.buffer.readChar();
1181 	}
1182 
1183 
1184 	StringBuffer qName = this.buffer
1185 	    .readStringBuffer(TEST_BLANK_GT_SLASH, START_TAG);
1186 	qName.insert(0, (char) this.currChar);
1187 	// Here, also the empty tag would be possible. 
1188 //System.out.println("start tag: |"+qName+"|");
1189 
1190 	// Skip whitespaces 
1191 	this.currChar = this.buffer.readChar();
1192 	while (Character.isWhitespace((char) this.currChar)) {
1193 	    this.buffer.
1194 		readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1195 	    this.currChar = this.buffer.readChar();
1196 	}
1197 
1198 	AttributesWrapper attributesWrapper = new AttributesWrapper();
1199 	// Here, either /, > or an attribute occurs
1200 //System.out.println("this.currChar: |"+(char)this.currChar+"|");
1201 	while (this.currChar != '/' && this.currChar != '>') {
1202 	    // parse the following attribute list
1203 	    this.xmlSgmlSpecifica.parseAttribute(attributesWrapper);
1204 
1205 	    // Skip whitespaces 
1206 	    while (Character.isWhitespace((char) this.currChar)) {
1207 		this.buffer
1208 		    .readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1209 		this.currChar = this.buffer.readChar();
1210 	    }
1211 	} // end parsing attribute list 
1212 //System.out.println("-this.currChar: |"+(char)this.currChar+"|");
1213 
1214 
1215 	Attributes attributes = attributesWrapper.getAttributes();
1216 	switch (this.currChar) {
1217 	    case '/':
1218 		// start-end-tag called empty tag
1219 
1220 		// skip illegal characters between "/" and ">" **** 
1221 	skipped: while (true) { //NOPMD
1222 		    this.currChar = this.buffer.readChar();
1223 		    switch (this.currChar) {
1224 			case '>':
1225 			    break skipped;
1226 			case -1:
1227 			    this.parseExceptionHandler
1228 				.foundUnexpectedEndOfDocument();
1229 				break;
1230 			default:
1231 			    this.parseExceptionHandler
1232 				.foundCharAfterEndOfEndTag
1233 				((char) this.currChar);
1234 			    break;
1235 		    } // switch
1236 		}
1237 
1238 		this.contentHandler.startElement(null,
1239 						 null,
1240 						 qName.toString(),
1241 						 attributes);
1242 		this.contentHandler.endElement(null,
1243 					       null,
1244 					       qName.toString());
1245 		break;
1246 	    case '>':
1247 		this.contentHandler.startElement(null,
1248 						 null,
1249 						 qName.toString(),
1250 						 attributes);
1251 		break;
1252 	    default:
1253 		throw new SAXParseException
1254 		    ("Expected finishing tag \"" + qName 
1255 		     + "\" with character '/' or '>' " 
1256 		     + "but found '" + (char) this.currChar + "'. ", null);
1257 	}
1258     }
1259 
1260     /**
1261      * Parses everything within a tag, a processing instruction, ... 
1262      * everything within brackets <code>&lt;</code> and <code>&gt;</code>. 
1263      *
1264      * @see #parseText
1265      */
1266     private int parseTagOrPI() throws IOException, SAXException {
1267 //System.out.println("parseTagOrPI");
1268 	
1269 	this.currChar = this.buffer.readChar();
1270 	//this.currChar = this.reader.read();
1271 	switch (this.currChar) {
1272 	    case '/':
1273 		// parsing an end-tag 
1274 		parseEndTag();
1275 		//this.currChar = this.reader.read();
1276 		break;
1277 	    case '!':
1278 		// parsing no tag at all: 
1279 		// a processing instruction or a comment
1280 		this.xmlSgmlSpecifica.parseCommentElemTypeDecl();
1281 		//this.currChar = this.reader.read();
1282 		break;
1283 	    case '?':
1284 		// parsing no tag at all: 
1285 		// a processing instruction or a comment
1286 		this.xmlSgmlSpecifica.parseExtProcessingInstruction();
1287 		//this.currChar = this.reader.read();
1288 		break;
1289 	    default:
1290 		// parsing a start-tag or an empty-element-tag 
1291 		parseStartOrStartEndTag();
1292 		break;
1293 	} // end of switch ()
1294 	//this.currChar = this.buffer.readChar();
1295 	// Here, the buffer is ready 
1296 	// to read the first character. after the generalized tag. 
1297 
1298 //System.out.println("read last: |"+(char)this.currChar+"|");
1299 //System.out.println("read last: |"+      this.currChar+"|");
1300 
1301 	return 1;
1302     }
1303 
1304     /**
1305      * Sets {@link #contentHandler}. 
1306      *
1307      * @param contentHandler 
1308      *    a <code>ContentHandler</code>. 
1309      */
1310     public void setContentHandler(ContentHandler contentHandler) {
1311 	if (isXMLParser()) {
1312 	    this.contentHandler = contentHandler;
1313 	} else {
1314 	    this.contentHandler = new SGMLFilter(contentHandler);
1315 	}
1316     }
1317 
1318     /**
1319      * Returns {@link #contentHandler}. 
1320      *
1321      * @return
1322      *    the <code>ContentHandler</code> {@link #contentHandler}. 
1323      */
1324     public ContentHandler getContentHandler() {
1325 	if (isXMLParser()) {
1326 	    return this.contentHandler;
1327 	} else {
1328 	    return ((SGMLFilter) this.contentHandler).getWrapped();
1329 	}
1330     }
1331 
1332     /**
1333      * Sets {@link #parseExceptionHandler}. 
1334      *
1335      * @param peHandler 
1336      *    a <code>ParseExceptionHandler</code>. 
1337      */
1338     public void setExceptionHandler(ParseExceptionHandler peHandler) {
1339 	this.parseExceptionHandler = peHandler;
1340     }
1341 
1342     /**
1343      * Returns {@link #parseExceptionHandler}. 
1344      *
1345      * @return
1346      *    the <code>ContentHandler</code> {@link #parseExceptionHandler}. 
1347      */
1348     public ParseExceptionHandler getExceptionHandler() {
1349 	return this.parseExceptionHandler;
1350     }
1351 
1352     /**
1353      * Sets whether this parser is used as an xml-parser. 
1354      * If this is false, which is the default, 
1355      * it s an html-parser. 
1356      *
1357      * @param xml 
1358      *    a <code>boolean</code> value signifying 
1359      *    whether this parser will be used as an xml-parser in the sequel. 
1360      * @return 
1361      *    a <code>boolean</code> value signifying 
1362      *    whether before invoking this method 
1363      *    this parser was used as an xml-parser
1364      */
1365     public boolean parseXML(boolean xml) {
1366 	boolean result = this.xmlSgmlSpecifica == xmlAttributeParser;
1367 	this.xmlSgmlSpecifica = xml 
1368 	    ? xmlAttributeParser 
1369 	    : htmlAttributeParser;
1370 	return result;
1371     }
1372 
1373     public boolean isXMLParser() {
1374 	return this.xmlSgmlSpecifica == xmlAttributeParser;
1375     }
1376 }