View Javadoc
1   
2   
3   package eu.simuline.util.sgml;
4   
5   import eu.simuline.util.ListMap;
6   
7   import java.io.Reader;
8   import java.io.IOException;
9   
10  import org.xml.sax.ContentHandler;
11  import org.xml.sax.Locator;
12  import org.xml.sax.Attributes;
13  import org.xml.sax.InputSource;
14  import org.xml.sax.SAXException;
15  import org.xml.sax.SAXParseException;
16  import java.util.Locale;
17  
18  /**
19   * A rudimentary <code>SGML</code> parser with something like a SAX-api. 
20   *
21   * @author <a href="mailto:ernst.reissner@simuline.eu">Ernst Reissner</a>
22   * @version 1.0
23   */
24  public final class SGMLParser {
25  
    /** Closing double quote, period and blank; terminates quoted text in exception messages. */
    private static final String QUOTE_DOT = "\". ";
    /** The character <code>=</code>; compared with the character following an attribute name. */
    private static final char   SYMB_EQ   = '=';
    /** The character <code>-</code>; compared with the characters following <code>&lt;!</code>. */
    private static final char   SYMB_COMMENT = '-';
    /** The character <code>&lt;</code>. */
    private static final char   SYMB_TAG = '<';
30  
31      /* --------------------------------------------------------------------- *
32       * inner classes                                                         *
33       * --------------------------------------------------------------------- */
34  
35      /**
36       * A <code>ContentHandler</code> which simply ignores all events.  
37       * May be used for debugging. 
38       */
39      static class TrivialContentHandler implements ContentHandler {
40  
41  	/** <!-- api-docs inherited from interface implemented.  -->*/ 
42  	public void setDocumentLocator(Locator locator) {
43  	    // is empty. 
44  	}
45  
46  	public void startDocument() throws SAXException {
47  	    // is empty. 
48  	}
49  
50  	public void endDocument() throws SAXException {
51  	    // is empty. 
52  	}
53  
54  	public void startPrefixMapping(String prefix,
55  				       String uri)
56  	    throws SAXException {
57  	    // is empty. 
58  	}
59  
60  	public void endPrefixMapping(String prefix)
61  	    throws SAXException {
62  	    // is empty. 
63  	}
64  
65  	public void startElement(String namespaceURI,
66  				 String localName,
67  				 String qName,
68  				 Attributes atts)
69  	    throws SAXException {
70  	    // is empty. 
71  	}
72  
73  	public void endElement(String namespaceURI,
74  			       String localName,
75  			       String qName)
76  	    throws SAXException {
77  	    // is empty. 
78  	}
79  
80  	public void characters(char[] chr,
81  			       int start,
82  			       int length)
83  	    throws SAXException {
84  	    // is empty. 
85  	}
86  
87  	public void ignorableWhitespace(char[] chr,
88  					int start,
89  					int length)
90  	    throws SAXException {
91  	    // is empty. 
92  	}
93  
94  	public void processingInstruction(String target,
95  					  String data)
96  	    throws SAXException {
97  	    // is empty. 
98  	}
99  
100 	public void skippedEntity(String name)
101 	    throws SAXException {
102 	    // is empty. 
103 	}
104     } // class TrivialContentHandler 
105 
106 
107     /**
108      * An **** partial **** implementation 
109      * of the SAX-interface <code>Attributes</code> 
110      * which allows to set name-value-pairs by method {@link #addAttribute}. 
111      */
112     class AttributesWrapper {
113 
114 	/* ----------------------------------------------------------------- *
115 	 * fields                                                            *
116 	 * ----------------------------------------------------------------- */
117 
118 	/**
119 	 * See {@link AttributesImpl#name2value}. 
120 	 */
121 	private final ListMap<String, String> name2value;
122 
123 	/* ----------------------------------------------------------------- *
124 	 * constructors                                                      *
125 	 * ----------------------------------------------------------------- */
126 
127 	/**
128 	 * Creates a new empty <code>AttributesWrapper</code> 
129 	 * which represents an empty attribute list. 
130 	 */
131 	AttributesWrapper() {
132 	    this.name2value = new ListMap<String, String>();
133 	}
134 
135 	/* ----------------------------------------------------------------- *
136 	 * methods                                                           *
137 	 * ----------------------------------------------------------------- */
138 
139 
140 	/**
141 	 * Adds an attribute with the given name and value. 
142 	 *
143 	 * @param attName 
144 	 *     the <code>String</code> representation 
145 	 *     of the name of an attribute. 
146 	 * @param attValue 
147 	 *     the value of an attribute as a <code>String</code>. 
148 	 *     If no value is provided, 
149 	 *     this is {@link AttributesImpl#NO_VALUE}. 
150 	 */
151 	void addAttribute(String attName, String attValue) {
152 	    String oldAttValue = this.name2value
153 		.put(attName, attValue);
154 	    if (oldAttValue != null) {
155 		// Here, the attribute has occured before. 
156 		SGMLParser.this.parseExceptionHandler
157 		    .foundMultipleAttribute(attName,
158 					    oldAttValue);
159 	    }
160  	}
161 
162 	Attributes getAttributes() {
163 	    return new AttributesImpl(this.name2value);
164 	}
165     } // class AttributesWrapper 
166 
167 
    /**
     * Provides a single method which decides whether the given character 
     * passes a certain test. 
     * Used by {@link Buffer#readArray} to find the position 
     * at which reading from the buffer stops. 
     */
    interface CharTester {

	/**
	 * Returns whether the given character <code>chr</code> 
	 * passes the test given by this <code>CharTester</code>. 
	 *
	 * @param chr 
	 *    the <code>char</code> to be tested. 
	 * @return 
	 *    whether the given character <code>chr</code> 
	 *    passes the test given by this <code>CharTester</code>. 
	 */
	boolean testChar(char chr);

    } // interface CharTester 
187 
188     /**
189      * Tests for blank, <code>/</code>, <code>&gt;</code>. 
190      */
191     private static final CharTester TEST_BLANK_GT_SLASH = new CharTester() {
192 	public boolean testChar(char chr) {
193 	    return Character.isWhitespace(chr) 
194 		|| chr == '/' 
195 		|| chr == '>';
196 	}
197     };
198 
199     /**
200      * Tests for blank or <code>&gt;</code>. 
201      */
202     private static final CharTester TEST_BLANK_GT = new CharTester() {
203 	public boolean testChar(char chr) {
204 	    return Character.isWhitespace(chr) 
205 		|| chr == '>';
206 	}
207     };
208 
209     /*
210      * Tests for <code>/</code> or <code>&gt;</code>. 
211      */
212 /*
213     private static final CharTester TEST_GT_SLASH = new CharTester() {
214 	    public boolean testChar(char ch) {
215 		return ch == '/' 
216 		    || ch == '>';
217 	    }
218 	};
219 */
220 
221     /**
222      * Tests for <code>&lt;</code>. 
223      */
224     private static final CharTester TEST_LT = new CharTester() {
225 	    public boolean testChar(char chr) {
226 		return chr == '<';
227 	    }
228 	};
229 
230     /**
231      * Tests for <code>&gt;</code>. 
232      */
233     private static final CharTester TEST_GT = new CharTester() {
234 	    public boolean testChar(char chr) {
235 		return chr == '>';
236 	    }
237 	};
238 
239     /**
240      * Tests for <code>=</code> and for <code>&gt;</code>. 
241      */
242     private static final CharTester TEST_BLANK_EQUALS_GT = new CharTester() {
243 	    public boolean testChar(char chr) {
244 		return Character.isWhitespace(chr) 
245 		    || chr == '='
246 		    || chr == '>';
247 	    }
248 	};
249 
250     /**
251      * Tests for whitespace. 
252      */
253     private static final CharTester TEST_NO_WHITESPACE = new CharTester() {
254 	    public boolean testChar(char chr) {
255 		return !Character.isWhitespace(chr);
256 	    }
257 	};
258 
259     /**
260      * Tests for quote both for<code>'</code> and for <code>"</code>. 
261      */
262 /*
263     private static final CharTester TEST_QUOTE = new CharTester() {
264 	    public boolean testChar(char chr) {
265 		return chr == '\'' 
266 		    || chr == '"';
267 	    }
268 	};
269 */
270 
271     /*
272      * Tests for end of comment <code>--></code>. 
273      * This tests for a sequence of characters 
274      * and confirms after having read the last one. 
275      */
276     private static final CharTester TEST_END_OF_COMMENT = new CharTester() {
277 
278 	    /**
279 	     * Contains the sequence <code>--></code> 
280 	     * representing the end of a comment. 
281 	     */
282 	    static final String END_OF_COMMENT = "-->";
283 
284 	    /**
285 	     * Contains the index in {@link #END_OF_COMMENT} 
286 	     * which is to be compared next by {@link #testChar}. 
287 	     */
288 	    private int index = 0;
289 
290 	    /**
291 	     * Returns whether the last characters tested 
292 	     * are <code>--></code>. 
293 	     *
294 	     * @param chr 
295 	     *    a <code>char</code>. 
296 	     * @return
297 	     *    whether the last characters tested 
298 	     *    including <code>char</code> are <code>--></code>. 
299 	     *    In particular, if less than three characters are read 
300 	     *    this is <code>false</code>. 
301 	     */
302 	    public boolean testChar(char chr) {		
303 		if (END_OF_COMMENT.charAt(this.index) == chr) {
304 		    this.index++;
305 		    if (this.index == END_OF_COMMENT.length() - 1) {
306 			this.index = 0;
307 			return true;
308 		    } else {
309 			return false;
310 		    }
311 		} else {
312 		    this.index = 0;
313 		    return false;
314 		}
315 	    }
316 	}; // TEST_END_OF_COMMENT 
317 
318     /**
319      * A <code>CharTester</code> which allows to specify 
320      * the character which passes the test. 
321      */
322     static class SpecCharTester implements CharTester {
323 
324 	/**
325 	 * The character which passes the test {@link #testChar}. 
326 	 */
327 	private char chr;
328 
329 	/**
330 	 * Sets {@link #chr} to the specified character value. 
331 	 *
332 	 * @param chr 
333 	 *    a <code>char</code> value. 
334 	 */
335 	void setChar(char chr) {
336 	    this.chr = chr;
337 	}
338 
339 	/**
340 	 * Returns whether the given character coincides with {@link #chr}. 
341 	 *
342 	 * @param chr 
343 	 *    a <code>char</code> value. 
344 	 * @return 
345 	 *    whether <code>ch</code> coincides with {@link #chr}. 
346 	 */
347 	public boolean testChar(char chr) {
348 	    return chr == this.chr;
349 	}
350     } // SpecCharTester 
351 
    /**
     * Tests for a single character which is set via 
     * {@link SpecCharTester#setChar} before use. 
     * This is used for quoted attribute values 
     * where the quote may be either <code>'</code> or <code>"</code>: 
     * the opening quote found is set as the character to test for. 
     * Note that this is a single shared, mutable instance. 
     *
     * @see XMLsGMLspecifica#parseAttribute
     */
    private static final SpecCharTester TEST_SPEC = new SpecCharTester();
360 
361     /**
362      * Class which buffers the read stream. 
363      */
364     static class Buffer {
365 
366 	/* ----------------------------------------------------------------- *
367 	 * fields                                                            *
368 	 * ----------------------------------------------------------------- */
369 
370 	/**
371 	 * The reader buffered. 
372 	 */
373 	private final Reader reader;
374 
375 	/**
376 	 * The current buffer. 
377 	 * The current parts to be read start with 
378 	 * <code>bufferArray[{@link #start}]</code> and end with 
379 	 * <code>bufferArray[{@link #end}]</code>, exclusively. 
380 	 */
381 	private final char[] bufferArray;
382 
383 	/**
384 	 * The first index in {@link #bufferArray} 
385 	 * read in from {@link #reader} but not returned 
386 	 * by {@link #readArray} or {@link #readChar}. 
387 	 */
388 	private int start;
389 
390 	/**
391 	 * Set by {@link #readArray} and read by {@link #getStartAndMove}. 
392 	 * When invoking {@link #readArray} <code>newStart</code> 
393 	 * is set to {@link #start} and increased 
394 	 * by the number of read charactersincreases. 
395 	 * Then {@link #getStartAndMove} updates {@link #start} 
396 	 * according to <code>newStart</code>. 
397 	 */
398 	private int newStart;
399 
400 	/**
401 	 * The first index in {@link #bufferArray} not read 
402 	 * from {@link #reader} 
403 	 * or <code>-1</code> if the end of the stream is reached. 
404 	 * This means that <code>bufferArray[end]</code> 
405 	 * either does not exist or at least is not significant. 
406 	 */
407 	private int end;
408 
409 	/* ----------------------------------------------------------------- *
410 	 * constructors                                                      *
411 	 * ----------------------------------------------------------------- */
412 
413 	/**
414 	 * Creates a new <code>Buffer</code> from the given reader 
415 	 * with the given size. 
416 	 *
417 	 * @param reader 
418 	 *    the <code>Reader</code> to be buffered. 
419 	 * @param length 
420 	 *    the length of the buffer. 
421 	 * @exception IOException 
422 	 *    if an error occurs 
423 	 */
424 	Buffer(Reader reader, int length) throws IOException {
425 	    this.reader = reader;
426 	    this.bufferArray = new char[length];
427 	    this.start = 0;
428 	    this.end = this.start; // signifies: reading necessary. 
429 	}
430 
431 	/* ----------------------------------------------------------------- *
432 	 * methods                                                           *
433 	 * ----------------------------------------------------------------- */
434 
435 
436 	/**
437 	 * Returns whether this buffer is currently empty. 
438 	 * When this is the case and someone tries to read further characters 
439 	 * this will lead to a trial 
440 	 * to read further pieces from {@link #reader}. 
441 	 *
442 	 * @return a <code>boolean</code> value 
443 	 *    signifying whether this buffer is currently empty. 
444 	 */
445 	boolean isEmpty() {
446 	    return this.end == this.start;
447 	}
448 
449 	/**
450 	 * Returns whether the end of the stream is reached. 
451 	 *
452 	 * @return 
453 	 *    a <code>boolean</code> specifying 
454 	 *    whether the end of the stream is reached. 
455 	 */
456 	boolean reachedEOS() {
457 	    return this.end == -1;
458 	}
459 
460 	/**
461 	 * Reads a single <code>char</code> and returns it. 
462 	 *
463 	 * @return 
464 	 *    an <code>int</code> value 
465 	 *    which is either the next <code>char</code> read in 
466 	 *    or <code>-1</code> which signifies the end of the stream. 
467 	 * @exception IOException 
468 	 *    if an error occurs
469 	 */
470 	int readChar() throws IOException {
471 	    if (reachedEOS()) {
472 		return -1;
473 	    }
474 	    if (isEmpty()) {
475 		this.start = 0;
476 		this.end = this.reader.read(this.bufferArray);
477 		if (reachedEOS()) {
478 		    return -1;
479 		}
480 	    }
481 	    return this.bufferArray[this.start++];
482 	}
483 
484 	/**
485 	 * Reads an array from {@link #reader}. 
486 	 * As a side effect, writes the field {@link #newStart}. 
487 	 * Also, if the portion of {@link #bufferArray} 
488 	 * to be read, i.e. between {@link #start} and {@link #end}, 
489 	 * is empty, a new portion is buffered. 
490 	 *
491 	 * @param charTester 
492 	 *    a <code>CharTester</code> which signifies 
493 	 *    when to end reading from the buffer. 
494 	 * @return 
495 	 *    an <code>int</code> signifying the number of <code>char</code>s 
496 	 *    read or <code>-1</code> which signifies the end of the stream. 
497 	 *    It is read to the next &lt; or, if there is none, 
498 	 *    to the end of the stream. 
499 	 *    Thus there is a difference between the return values 
500 	 *    <code>-1</code> and <code>0</code>. 
501 	 * @exception IOException 
502 	 *    if an error occurs
503 	 */
504 	int readArray(CharTester charTester) throws IOException {
505 	    if (reachedEOS()) {
506 		return -1;
507 	    }
508 	    if (isEmpty()) {
509 		this.start = 0;
510 		this.end = this.reader.read(this.bufferArray);
511 //System.out.println("read: "+this.end);
512 		if (reachedEOS()) {
513 		    return -1;
514 		}
515 	    }
516 
517 	    for (int i = this.start; i < this.end; i++) {
518 		if (charTester.testChar(this.bufferArray[i])) {
519 		    // found match described by charTester 
520 		    this.newStart = i;
521 		    return this.newStart - this.start;
522 		}
523 	    }
524             // Here, the test always failed. 
525 	    this.newStart = this.end;
526 	    return this.end - this.start;
527 	}
528 
529 	/**
530 	 * Describe <code>readStringBuffer</code> method here.
531 	 *
532 	 * @param charTester 
533 	 *    a <code>CharTester</code> which determines 
534 	 *    the first character not read 
535 	 *    into the resulting <code>StringBuffer</code>. 
536 	 * @param elementName
537 	 *    a <code>String</code> which determines 
538 	 *    the element under consideration. 
539 	 *    This is only used for generating the message of a 
540 	 *    <code>SAXParseException</code>. 
541 	 *    <p>
542 	 *    Allowed values: {@link #START_TAG}, {@link #END_TAG}, 
543 	 *    {@link #PROC_INSTR}, 
544 	 *    {@link #ATTR_NAME}, {@link #WHITESP_IN_ATTR} 
545 	 *    and {@link #ATTR_VALUE}. ****** comment and &lt;!element missing. 
546 	 * @return 
547 	 *    a <code>StringBuffer</code> containing characters 
548 	 *    starting with the current one until one 
549 	 *    <code>charTester</code> returns <code>true</code>. 
550 	 * @exception IOException 
551 	 *    if an io-error occurs
552 	 * @exception SAXParseException 
553 	 *    if the parser faces the end of the stream 
554 	 *    while scanning the current element. 
555 	 */
556 	StringBuffer readStringBuffer(CharTester charTester, 
557 				      String elementName) 
558 	    throws IOException, SAXParseException {
559 
560 	    StringBuffer qName = new StringBuffer();
561 	    int numRead = 0;
562 	    do {
563 		numRead = readArray(charTester);
564 		if (numRead == -1) {
565 		    throw new SAXParseException
566 			("End of stream while scanning " 
567 			 + elementName + ". " 
568 			 + "Read so far: \"" 
569 			 + qName + QUOTE_DOT, null);
570 		}
571 		qName.append(getChars(),
572 			     getStartAndMove(),
573 			     numRead);
574 	    } while (isEmpty());
575 
576 	    return qName;
577 	}
578 
579 	/**
580 	 * Returns the buffer of <code>char</code>s. 
581 	 *
582 	 * @return 
583 	 *    the <code>char[]</code> {@link #bufferArray}. 
584 	 */
585 	char[] getChars() {
586 	    return this.bufferArray;
587 	}
588 
589 	/**
590 	 * Moves {@link #newStart} to {@link #start} 
591 	 * and returns the old value of {@link #start}. 
592 	 *
593 	 * @return 
594 	 *    the old <code>int</code> value of {@link #start}. 
595 	 */
596 	int getStartAndMove() {
597 	    int ret = this.start;
598 	    this.start = this.newStart;
599 	    return ret;
600 	}
601 
602 	/**
603 	 * Get method for {@link #start}. 
604 	 *
605 	 * @return {@link #start}
606 	 */
607 	int getStart() {
608 	    return this.start;
609 	}
610 
611 	/**
612 	 * Get method for {@link #end}. 
613 	 *
614 	 * @return {@link #end}
615 	 */
616 	int getEnd() {
617 	    return this.end;
618 	}
619     } // class Buffer
620 
    /**
     * Provides a bunch of methods for parsing 
     * with implementations specific to xml and sgml. 
     */
    interface XMLsGMLspecifica {
	// **** SGMLParser.this.currChar must be the character 
	// after the attribute list. 
	/**
	 * Parses one attribute and adds it to the given attribute list. 
	 *
	 * @param attributes 
	 *    an <code>AttributesWrapper</code> 
	 *    to which the attribute parsed is added. 
	 * @exception IOException 
	 *    if an io-error occurs
	 * @exception SAXException 
	 *    if a syntactical error occurs
	 */
	void parseAttribute(AttributesWrapper attributes) 
	    throws IOException, SAXException;

	/**
	 * Parses a comment or any declaration 
	 * starting with <code>&lt;!...</code>. 
	 * Whether the handler is notified depends on the implementation. 
	 *
	 * @exception IOException 
	 *    if an io-error occurs
	 * @exception SAXException 
	 *    if a syntactical error occurs
	 */
	void parseCommentElemTypeDecl() throws IOException, SAXException;

	/**
	 * Parses a processing instruction or any declaration 
	 * starting with <code>&lt;?...</code>. 
	 * How the handler is notified depends on the implementation. 
	 *
	 * @exception IOException 
	 *    if an io-error occurs
	 * @exception SAXException 
	 *    if a syntactical error occurs
	 */
	void parseExtProcessingInstruction() throws IOException, SAXException;

    } // interface XMLsGMLspecifica 
665 
666     /**
667      * Contains the <code>HTML</code>-specific part of the parser. 
668      */
669     private final XMLsGMLspecifica htmlAttributeParser = 
670     new XMLsGMLspecifica() {
671 
672 	public void parseAttribute(AttributesWrapper attributes) 
673 	    throws IOException, SAXException {
674 	    String attName;
675 	    String attValue;
676 	    StringBuffer qName;
677 
678 	    // Parse attribute name 
679 	    qName = SGMLParser.this.buffer.
680 		readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
681 	    qName.insert(0, (char) SGMLParser.this.currChar);
682 	    attName = qName.toString().toLowerCase(Locale.ENGLISH);
683 //System.out.println("attName: |"+attName+"|");
684 
685 	    // Here, the attribute may have a value or not. 
686 
687 	    // Skip whitespace either after having parsed the attribute 
688 	    // or between its name and its value. 
689 	    SGMLParser.this.currChar = 
690 		SGMLParser.this.buffer.readChar(); //NOPMD
691 	    if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
692 		qName = SGMLParser.this.buffer.
693 		    readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
694 		SGMLParser.this.currChar = 
695 		    SGMLParser.this.buffer.readChar(); //NOPMD
696 	    }
697 
698 	    // Here is the decision whether a value is provided or not. 
699 	    if (SGMLParser.this.currChar != SYMB_EQ) {
700 		// Here, no value may be given 
701 		attributes.addAttribute(attName, AttributesImpl.NO_VALUE);
702 //System.out.println("attName: |"+attName+"|");
703 //System.out.println("noValue@@"+(char)SGMLParser.this.currChar+"|");
704 		return;
705 	    }
706 	    // Here, clearly a value must follow 
707 
708 	    // Skip whitespaces 
709 	    qName = SGMLParser.this.buffer.
710 		readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
711 	    SGMLParser.this.currChar = 
712 		SGMLParser.this.buffer.readChar(); //NOPMD
713 
714 	    // Parse the attribute value. 
715 	    switch (SGMLParser.this.currChar) {
716 		case '\'':
717 		    // fall through 
718 		case '"':
719 		    // the attribute value is quoted. 
720 		    char quote = (char) SGMLParser.this.currChar;
721 		    TEST_SPEC.setChar(quote);
722 		    //SGMLParser.this.currChar = 
723 		    //	SGMLParser.this.buffer.readChar();
724 
725 //System.out.println("quote@@"+SGMLParser.this.currChar);
726 		    qName = new StringBuffer();
727 		    while (true) {
728 			qName.append(SGMLParser.this.buffer.
729 				     readStringBuffer(TEST_SPEC, ATTR_VALUE));
730 			if (qName.length() != 0 
731 			    && qName.charAt(qName.length() - 1) == '\\') {
732 			    qName.setCharAt(qName.length() - 1, quote);
733 			} else {
734 			    // read the quote 
735 			    SGMLParser.this.currChar = //NOPMD
736 				SGMLParser.this.buffer.readChar();
737 			    break;
738 			}
739 		    }
740 		    break;
741 		default:
742 //System.out.println("no quote@@"+SGMLParser.this.currChar);
743 		    // the attribute value is not quoted. 
744 		    qName = SGMLParser.this.buffer.
745 			readStringBuffer(TEST_BLANK_GT, ATTR_VALUE);
746 		    qName.insert(0, (char) SGMLParser.this.currChar);
747 		    break;
748 	    }
749 	    // read the character after the attribute value 
750 	    SGMLParser.this.currChar = 
751 		SGMLParser.this.buffer.readChar(); //NOPMD
752 
753 	    attValue = qName.toString();
754 	    attributes.addAttribute(attName, attValue);
755 //System.out.println("attName: |"+attName+"|");
756 //System.out.println("attValue: |"+attValue+"|");
757 	}
758 
759 	public void parseCommentElemTypeDecl() 
760 	    throws IOException, SAXException {
761 //System.out.println("comment?");
762 
763 	    SGMLParser.this.currChar = //NOPMD
764 		SGMLParser.this.buffer.readChar();
765 	    if (SGMLParser.this.currChar != SYMB_COMMENT) {
766 		//int numRead = 
767 		SGMLParser.this.buffer.readArray(TEST_GT);
768 		SGMLParser.this.buffer.getStartAndMove();
769 		return;
770 	    }
771 	    // Here, object starts with "<!-....."
772 
773 	    SGMLParser.this.currChar = //NOPMD
774 		SGMLParser.this.buffer.readChar();
775 	    if (SGMLParser.this.currChar != SYMB_COMMENT) {
776 		throw new SAXParseException
777 		    ("Comments must start with \"<!--\" but found " 
778 		     + "\"<!-" + (char) SGMLParser.this.currChar + QUOTE_DOT, 
779 		     null);
780 	    }
781 //System.out.println("comment!");
782 
783 	    int numRead = 0;
784 	    do {
785 		numRead = SGMLParser.this.buffer
786 		    .readArray(TEST_END_OF_COMMENT);
787 		if (numRead == -1) {
788 		    StringBuffer qName = new StringBuffer();
789 		    qName.append(SGMLParser.this.buffer.getChars(),
790 				 SGMLParser.this.buffer.getStartAndMove(),
791 				 numRead);
792 		    throw new SAXParseException
793 			("End of stream while scanning comment. " 
794 			 + "Recently read: \"" + qName + QUOTE_DOT, 
795 			 null);
796 		}
797 
798 		SGMLParser.this.buffer.getStartAndMove();
799 	    } while (SGMLParser.this.buffer.isEmpty());
800 /*
801 	    StringBuffer qName = new StringBuffer();
802 	    qName.append(SGMLParser.this.buffer.getChars(),
803 			 SGMLParser.this.buffer.getStartAndMove(),
804 			 numRead);
805 
806 	    System.out.println("read so far: |"+qName+"|");
807 */
808 
809 	    SGMLParser.this.buffer.getStartAndMove();	    
810 	    // NO NOTIFY!!
811 	}
812 
813 	public void parseExtProcessingInstruction() 
814 	    throws IOException, SAXException {
815 	    parseStartOrStartEndTag();
816 	}
817     }; // htmlXML_SGML_Specifica
818 
819     /**
820      * Contains the <code>XML</code>-specific part of the parser. 
821      */
822     private final XMLsGMLspecifica xmlAttributeParser = 
823     new XMLsGMLspecifica() {
824 
825 	public void parseAttribute(AttributesWrapper attributes) 
826 	    throws IOException, SAXException {
827 	    String attName;
828 	    String attValue;
829 	    StringBuffer qName;
830 
831 	    // Parse attribute name 
832 	    qName = SGMLParser.this.buffer.
833 		readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
834 	    qName.insert(0, (char) SGMLParser.this.currChar);
835 	    attName = qName.toString();
836 //System.out.println("attName: |"+attName+"|");
837 
838 	    // Here, the attribute may have a value or not. 
839 
840 	    // Skip whitespace either after having parsed the attribute 
841 	    // or between its name and its value. 
842 	    SGMLParser.this.currChar = 
843 		SGMLParser.this.buffer.readChar(); //NOPMD
844 	    if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
845 		qName = SGMLParser.this.buffer.
846 		    readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
847 		SGMLParser.this.currChar = 
848 		    SGMLParser.this.buffer.readChar(); //NOPMD
849 	    }
850 
851 	    // Here is the decision whether a value is provided or not. 
852 	    if (SGMLParser.this.currChar != SYMB_EQ) {
853 		// Here, a value is missing. 
854 		throw new SAXParseException
855 		    ("Missing value for attribute \"" 
856 		     + attName + QUOTE_DOT, null);
857 	    }
858 	    // Here, clearly a value must follow ****
859 
860 
861 	    // Skip whitespaces 
862 	    qName = SGMLParser.this.buffer.
863 		readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
864 	    SGMLParser.this.currChar = 
865 		SGMLParser.this.buffer.readChar(); //NOPMD
866 
867 
868 	    // Parse the attribute value. 
869 	    switch (SGMLParser.this.currChar) {
870 		case '\'':
871 		    // fall through 
872 		case '"':
873 		    // the attribute value is quoted. 
874 		    char quote = (char) SGMLParser.this.currChar;
875 		    TEST_SPEC.setChar(quote);
876 		    //SGMLParser.this.currChar = 
877 		    //	SGMLParser.this.buffer.readChar();
878 
879 //System.out.println("quote@@"+SGMLParser.this.currChar);
880 		    qName = new StringBuffer();
881 		    while (true) {
882 			qName.append(SGMLParser.this.buffer.
883 				     readStringBuffer(TEST_SPEC, ATTR_VALUE));
884 			if (qName.length() != 0 
885 			    && qName.charAt(qName.length() - 1) == '\\') {
886 			    qName.setCharAt(qName.length() - 1, quote);
887 			} else {
888 			    // read the quote 
889 			    SGMLParser.this.currChar = //NOPMD
890 				SGMLParser.this.buffer.readChar();
891 			    break;
892 			}
893 		    }
894 		    break;
895 		default:
896 		    throw new SAXParseException
897 			("Value of attribute \"" + attName + 
898 			 "\" is not quoted. ",
899 			 null);
900 	    }
901 	    // read the character after the attribute value 
902 	    SGMLParser.this.currChar = 
903 		SGMLParser.this.buffer.readChar(); //NOPMD
904 
905 	    attValue = qName.toString();
906 	    attributes.addAttribute(attName, attValue);
907 //System.out.println("attName: |"+attName+"|");
908 //System.out.println("attValue: |"+attValue+"|");
909 	}
910 
911 	public void parseCommentElemTypeDecl() 
912 	    throws IOException, SAXException {
913 	    // ******** comments will not work that way!!!*****
914 
915 	    SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
916 	    //**** comment
917 	    // Here, also the empty processing instruction or comment 
918 	    // would be possible. 
919 	
920 	    //this.buffer.getStart();
921 //System.out.println("-qName: |"+qName+"|");
922 	    SGMLParser.this.currChar = 
923 		SGMLParser.this.buffer.readChar(); //NOPMD
924 	    //assert this.currChar == '>';
925 	    SGMLParser.this.contentHandler.processingInstruction(null, null);
926 	}
927 
928 
929 	public void parseExtProcessingInstruction() 
930 	    throws IOException, SAXException {
931 
932 	    //StringBuffer qName = 
933 	    SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
934 	    // Here, also the empty processing instruction would be possible. 
935 	
936 	    //this.buffer.getStart();
937 //System.out.println("-qName: |"+qName+"|");
938 	    SGMLParser.this.currChar = 
939 		SGMLParser.this.buffer.readChar(); //NOPMD
940 	    //assert this.currChar == '>';
941 	    SGMLParser.this.contentHandler.processingInstruction(null, null);
942 	}
943 
944     }; // xmlXML_SGML_Specifica
945 
946     /* --------------------------------------------------------------------- *
947      * class constants                                                       *
948      * --------------------------------------------------------------------- */
949 
950 
    /**
     * The size of the buffer used internally. 
     * This must be at least <code>1</code>. 
     * I found no significant difference in speed when increasing this number. 
     * The buffer coming from a stream from a URL seems to have a maximal size 
     * of <code>1448</code> whereas for file streams there seems to be no bound. 
     * In the cases considered, the file is read in as a whole. 
     */
    private static final int BUFFER_SIZE = 999999;
960 
    // The following constants are passed to Buffer.readStringBuffer 
    // as the description of the object currently parsed; 
    // they occur in the message of the SAXParseException 
    // thrown there at the end of the stream. 

    /**
     * Description of a start tag 
     * for the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String START_TAG = "start tag";

    /**
     * Description of an end tag 
     * for the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String END_TAG = "end tag";

    /**
     * Description of a processing instruction 
     * for the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String PROC_INSTR = "processing instruction";

    /**
     * Description of an attribute name 
     * for the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String ATTR_NAME = "attribute name";

    /**
     * Description of whitespace within an attribute, 
     * i.e. around the <code>=</code> between name and value, 
     * for the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String WHITESP_IN_ATTR = "whitespace in attribute";

    /**
     * Description of an attribute value 
     * for the message of the exception 
     * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}. 
     */
    private static final String ATTR_VALUE = "attribute value";
1003 
1004     /* --------------------------------------------------------------------- *
1005      * fields                                                                *
1006      * --------------------------------------------------------------------- */
1007 
    /**
     * Contains the class with methods specific for xml and sgml, 
     * respectively. 
     * Initially this is <code>htmlAttributeParser</code>, 
     * i.e. by default this parser is no xml-parser. 
     * May be switched by {@link #parseXML(boolean)} 
     * and is queried by {@link #isXMLParser()}. 
     */
    private XMLsGMLspecifica xmlSgmlSpecifica = htmlAttributeParser;

    /**
     * The current character or <code>-1</code> 
     * to signify the end of the stream. 
     * This is an <code>int</code> rather than a <code>char</code> 
     * to be able to hold the value <code>-1</code>. 
     */
    private int currChar;

    /**
     * The <code>ContentHandler</code> registered. 
     * Initialized by the constructor with a handler ignoring all events. 
     * Note that {@link #setContentHandler(ContentHandler)} 
     * stores the handler given wrapped in an <code>SGMLFilter</code> 
     * unless this parser is an xml-parser. 
     */
    private ContentHandler contentHandler;

    /**
     * The <code>ParseExceptionHandler</code> registered. 
     * It is notified of irregularities found while parsing 
     * such as illegal characters within a tag. 
     */
    private ParseExceptionHandler parseExceptionHandler;

    /**
     * The buffer of the input stream. 
     * Created anew by each invocation of {@link #parse(Reader)} 
     * and thus <code>null</code> before the first one. 
     */
    private Buffer buffer;
1033 
1034     /* --------------------------------------------------------------------- *
1035      * constructors                                                          *
1036      * --------------------------------------------------------------------- */
1037 
1038     /**
1039      * Creates a new <code>SGMLParser</code> 
1040      * with the default handlers for content and exceptions. 
1041      */
1042     @SuppressWarnings("checkstyle:nowhitespaceafter")
1043     public SGMLParser() {
1044 	this.       contentHandler = new TrivialContentHandler();
1045 	this.parseExceptionHandler = new ParseExceptionHandler.Impl();
1046     }
1047 
1048     /* --------------------------------------------------------------------- *
1049      * methods                                                               *
1050      * --------------------------------------------------------------------- */
1051 
1052 
1053     /**
1054      * Parses the <code>InputSource</code> given 
1055      * but delegates everything inside a tag or a processing instruction 
1056      * to {@link #parseTagOrPI}. 
1057      *
1058      * @param src 
1059      *    an <code>InputSource</code>. 
1060      * @exception IOException if an error occurs
1061      * @exception SAXException if an error occurs
1062      */
1063     void parse(InputSource src) throws IOException, SAXException {
1064 	parse(src.getCharacterStream());
1065     }
1066 
1067     /**
1068      * Parses the given <code>InputStream</code>. 
1069      *
1070      * @param reader 
1071      *     an <code>Reader</code> sequentializing an SGML document. 
1072      * @exception IOException 
1073      *     if an error reading the stream occurs. 
1074      * @exception SAXException 
1075      *    if an error with the sgml-syntax occurs. 
1076      */
1077     public void parse(Reader reader) throws IOException, SAXException {
1078 
1079 	this.buffer = new Buffer(reader, BUFFER_SIZE);
1080 	int numRead = this.buffer.readArray(TEST_LT);
1081 	// notify handler that first part of document was successfully read. 
1082 	this.contentHandler.startDocument();
1083 	while (numRead != -1) {
1084 	    this.currChar = this.buffer.readChar(); // the '<' char? 
1085 	    if (this.currChar == SYMB_TAG) {
1086 		// a tag or a PI. 
1087 		numRead = parseTagOrPI();
1088 	    } else {
1089 		// either characters or ignoreableWhitespace
1090 		numRead = parseText();
1091 	    }
1092 	}
1093 	// Here, the document is finished. 
1094 	this.contentHandler.endDocument();
1095     }
1096 
1097     /**
1098      * Parses everything outside a tag, a processing instruction, ... 
1099      * everything within brackets <code>&lt;</code> and <code>&gt;</code>. 
1100      * ***** Missing: distinction between notification 
1101      * of characters and whitespace. ****
1102      *
1103      * @exception IOException 
1104      *     if an error reading the stream occurs. 
1105      * @exception SAXException 
1106      *    if an error with the sgml-syntax occurs. 
1107      * @see #parseTagOrPI
1108      */
1109     private int parseText() throws IOException, SAXException {
1110 	int numRead = this.buffer.readArray(TEST_LT);
1111 	if (numRead != -1) {
1112 /*
1113   System.out.println("text: |"+new String(buffer.getChars(),
1114   buffer.getStartAndMove(),
1115   numRead)+"|");
1116 */
1117 	    this.contentHandler.characters(this.buffer.getChars(),
1118 					   this.buffer.getStartAndMove(),
1119 					   numRead);
1120 	}
1121 	
1122 //buffer.getStartAndMove();
1123 	return numRead;
1124     }
1125 
1126     /**
1127      * Parses an end-tag notifying the underlying handler. 
1128      *
1129      * @exception IOException 
1130      *     if an error reading the stream occurs. 
1131      * @exception SAXException 
1132      *    if an error with the sgml-syntax occurs. 
1133      */
1134     void parseEndTag() throws IOException, SAXException {
1135 	StringBuffer qName = this.buffer.readStringBuffer(TEST_GT, END_TAG);
1136 	// Here, also the empty tag would be possible. 
1137 	
1138 	//this.buffer.getStart();
1139 //System.out.println("end tag: |"+qName+"|");
1140 	this.currChar = this.buffer.readChar();
1141 	//assert this.currChar == '>';
1142 	this.contentHandler.endElement(null,
1143 				       null,
1144 				       qName.toString());
1145 	this.currChar = this.buffer.readChar();
1146     }
1147 /*
1148   public void parseCommentElemTypeDecl() 
1149   throws IOException, SAXException {
1150   // ******** comments will not work that way!!!*****
1151 
1152   StringBuffer qName = this.buffer
1153   .readStringBuffer(TEST_GT, PROC_INSTR);//**** comment
1154   // Here, also the empty processing instruction or comment 
1155   // would be possible. 
1156 	
1157   //this.buffer.getStart();
1158 //System.out.println("-qName: |"+qName+"|");
1159 this.currChar = this.buffer.readChar();
1160 //assert this.currChar == '>';
1161 this.handler.processingInstruction(null, null);
1162 }
1163 */
1164 
1165     /**
1166      * Parses a start-tag or, for xml, an empty tag. 
1167      *
1168      * @exception IOException 
1169      *     if an error reading the stream occurs. 
1170      * @exception SAXException 
1171      *    if an error with the sgml-syntax occurs. 
1172      */
1173     void parseStartOrStartEndTag() throws IOException, SAXException {
1174 
1175 	// ***** Better read the name of the tag and 
1176 	// then single out problems with chars by a handler
1177 	if (!Character.isLetter((char) this.currChar)) {
1178 	    this.parseExceptionHandler
1179 		.foundIllegalCharInTag((char) this.currChar);
1180 	    // Ignore the previously read char. 
1181 	    this.currChar = this.buffer.readChar();
1182 	}
1183 
1184 
1185 	StringBuffer qName = this.buffer
1186 	    .readStringBuffer(TEST_BLANK_GT_SLASH, START_TAG);
1187 	qName.insert(0, (char) this.currChar);
1188 	// Here, also the empty tag would be possible. 
1189 //System.out.println("start tag: |"+qName+"|");
1190 
1191 	// Skip whitespaces 
1192 	this.currChar = this.buffer.readChar();
1193 	while (Character.isWhitespace((char) this.currChar)) {
1194 	    this.buffer.
1195 		readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1196 	    this.currChar = this.buffer.readChar();
1197 	}
1198 
1199 	AttributesWrapper attributesWrapper = new AttributesWrapper();
1200 	// Here, either /, > or an attribute occurs
1201 //System.out.println("this.currChar: |"+(char)this.currChar+"|");
1202 	while (this.currChar != '/' && this.currChar != '>') {
1203 	    // parse the following attribute list
1204 	    this.xmlSgmlSpecifica.parseAttribute(attributesWrapper);
1205 
1206 	    // Skip whitespaces 
1207 	    while (Character.isWhitespace((char) this.currChar)) {
1208 		this.buffer
1209 		    .readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1210 		this.currChar = this.buffer.readChar();
1211 	    }
1212 	} // end parsing attribute list 
1213 //System.out.println("-this.currChar: |"+(char)this.currChar+"|");
1214 
1215 
1216 	Attributes attributes = attributesWrapper.getAttributes();
1217 	switch (this.currChar) {
1218 	    case '/':
1219 		// start-end-tag called empty tag
1220 
1221 		// skip illegal characters between "/" and ">" **** 
1222 	skipped: while (true) { //NOPMD
1223 		    this.currChar = this.buffer.readChar();
1224 		    switch (this.currChar) {
1225 			case '>':
1226 			    break skipped;
1227 			case -1:
1228 			    this.parseExceptionHandler
1229 				.foundUnexpectedEndOfDocument();
1230 				break;
1231 			default:
1232 			    this.parseExceptionHandler
1233 				.foundCharAfterEndOfEndTag
1234 				((char) this.currChar);
1235 			    break;
1236 		    } // switch
1237 		}
1238 
1239 		this.contentHandler.startElement(null,
1240 						 null,
1241 						 qName.toString(),
1242 						 attributes);
1243 		this.contentHandler.endElement(null,
1244 					       null,
1245 					       qName.toString());
1246 		break;
1247 	    case '>':
1248 		this.contentHandler.startElement(null,
1249 						 null,
1250 						 qName.toString(),
1251 						 attributes);
1252 		break;
1253 	    default:
1254 		throw new SAXParseException
1255 		    ("Expected finishing tag \"" + qName 
1256 		     + "\" with character '/' or '>' " 
1257 		     + "but found '" + (char) this.currChar + "'. ", null);
1258 	}
1259     }
1260 
1261     /**
1262      * Parses everything within a tag, a processing instruction, ... 
1263      * everything within brackets <code>&lt;</code> and <code>&gt;</code>. 
1264      *
1265      * @see #parseText
1266      */
1267     private int parseTagOrPI() throws IOException, SAXException {
1268 //System.out.println("parseTagOrPI");
1269 	
1270 	this.currChar = this.buffer.readChar();
1271 	//this.currChar = this.reader.read();
1272 	switch (this.currChar) {
1273 	    case '/':
1274 		// parsing an end-tag 
1275 		parseEndTag();
1276 		//this.currChar = this.reader.read();
1277 		break;
1278 	    case '!':
1279 		// parsing no tag at all: 
1280 		// a processing instruction or a comment
1281 		this.xmlSgmlSpecifica.parseCommentElemTypeDecl();
1282 		//this.currChar = this.reader.read();
1283 		break;
1284 	    case '?':
1285 		// parsing no tag at all: 
1286 		// a processing instruction or a comment
1287 		this.xmlSgmlSpecifica.parseExtProcessingInstruction();
1288 		//this.currChar = this.reader.read();
1289 		break;
1290 	    default:
1291 		// parsing a start-tag or an empty-element-tag 
1292 		parseStartOrStartEndTag();
1293 		break;
1294 	} // end of switch ()
1295 	//this.currChar = this.buffer.readChar();
1296 	// Here, the buffer is ready 
1297 	// to read the first character. after the generalized tag. 
1298 
1299 //System.out.println("read last: |"+(char)this.currChar+"|");
1300 //System.out.println("read last: |"+      this.currChar+"|");
1301 
1302 	return 1;
1303     }
1304 
1305     /**
1306      * Sets {@link #contentHandler}. 
1307      *
1308      * @param contentHandler 
1309      *    a <code>ContentHandler</code>. 
1310      */
1311     public void setContentHandler(ContentHandler contentHandler) {
1312 	if (isXMLParser()) {
1313 	    this.contentHandler = contentHandler;
1314 	} else {
1315 	    this.contentHandler = new SGMLFilter(contentHandler);
1316 	}
1317     }
1318 
1319     /**
1320      * Returns {@link #contentHandler}. 
1321      *
1322      * @return
1323      *    the <code>ContentHandler</code> {@link #contentHandler}. 
1324      */
1325     public ContentHandler getContentHandler() {
1326 	if (isXMLParser()) {
1327 	    return this.contentHandler;
1328 	} else {
1329 	    return ((SGMLFilter) this.contentHandler).getWrapped();
1330 	}
1331     }
1332 
1333     /**
1334      * Sets {@link #parseExceptionHandler}. 
1335      *
1336      * @param peHandler 
1337      *    a <code>ParseExceptionHandler</code>. 
1338      */
1339     public void setExceptionHandler(ParseExceptionHandler peHandler) {
1340 	this.parseExceptionHandler = peHandler;
1341     }
1342 
1343     /**
1344      * Returns {@link #parseExceptionHandler}. 
1345      *
1346      * @return
1347      *    the <code>ContentHandler</code> {@link #parseExceptionHandler}. 
1348      */
1349     public ParseExceptionHandler getExceptionHandler() {
1350 	return this.parseExceptionHandler;
1351     }
1352 
1353     /**
1354      * Sets whether this parser is used as an xml-parser. 
1355      * If this is false, which is the default, 
1356      * it s an html-parser. 
1357      *
1358      * @param xml 
1359      *    a <code>boolean</code> value signifying 
1360      *    whether this parser will be used as an xml-parser in the sequel. 
1361      * @return 
1362      *    a <code>boolean</code> value signifying 
1363      *    whether before invoking this method 
1364      *    this parser was used as an xml-parser
1365      */
1366     public boolean parseXML(boolean xml) {
1367 	boolean result = this.xmlSgmlSpecifica == xmlAttributeParser;
1368 	this.xmlSgmlSpecifica = xml 
1369 	    ? xmlAttributeParser 
1370 	    : htmlAttributeParser;
1371 	return result;
1372     }
1373 
1374     public boolean isXMLParser() {
1375 	return this.xmlSgmlSpecifica == xmlAttributeParser;
1376     }
1377 }