1
2
3 package eu.simuline.util.sgml;
4
5 import eu.simuline.util.ListMap;
6
7 import java.io.Reader;
8 import java.io.IOException;
9
10 import org.xml.sax.ContentHandler;
11 import org.xml.sax.Locator;
12 import org.xml.sax.Attributes;
13 import org.xml.sax.InputSource;
14 import org.xml.sax.SAXException;
15 import org.xml.sax.SAXParseException;
16 import java.util.Locale;
17
18 /**
19 * A rudimentary <code>SGML</code> parser with something like a SAX-api.
20 *
21 * @author <a href="mailto:ernst.reissner@simuline.eu">Ernst Reissner</a>
22 * @version 1.0
23 */
24 public final class SGMLParser {
25
26 private static final String QUOTE_DOT = "\". ";
27 private static final char SYMB_EQ = '=';
28 private static final char SYMB_COMMENT = '-';
29 private static final char SYMB_TAG = '<';
30
31 /* --------------------------------------------------------------------- *
32 * inner classes *
33 * --------------------------------------------------------------------- */
34
35 /**
36 * A <code>ContentHandler</code> which simply ignores all events.
37 * May be used for debugging.
38 */
39 static class TrivialContentHandler implements ContentHandler {
40
41 /** <!-- api-docs inherited from interface implemented. -->*/
42 public void setDocumentLocator(Locator locator) {
43 // is empty.
44 }
45
46 public void startDocument() throws SAXException {
47 // is empty.
48 }
49
50 public void endDocument() throws SAXException {
51 // is empty.
52 }
53
54 public void startPrefixMapping(String prefix,
55 String uri)
56 throws SAXException {
57 // is empty.
58 }
59
60 public void endPrefixMapping(String prefix)
61 throws SAXException {
62 // is empty.
63 }
64
65 public void startElement(String namespaceURI,
66 String localName,
67 String qName,
68 Attributes atts)
69 throws SAXException {
70 // is empty.
71 }
72
73 public void endElement(String namespaceURI,
74 String localName,
75 String qName)
76 throws SAXException {
77 // is empty.
78 }
79
80 public void characters(char[] chr,
81 int start,
82 int length)
83 throws SAXException {
84 // is empty.
85 }
86
87 public void ignorableWhitespace(char[] chr,
88 int start,
89 int length)
90 throws SAXException {
91 // is empty.
92 }
93
94 public void processingInstruction(String target,
95 String data)
96 throws SAXException {
97 // is empty.
98 }
99
100 public void skippedEntity(String name)
101 throws SAXException {
102 // is empty.
103 }
104 } // class TrivialContentHandler
105
106
107 /**
108 * An **** partial **** implementation
109 * of the SAX-interface <code>Attributes</code>
110 * which allows to set name-value-pairs by method {@link #addAttribute}.
111 */
112 class AttributesWrapper {
113
114 /* ----------------------------------------------------------------- *
115 * fields *
116 * ----------------------------------------------------------------- */
117
118 /**
119 * See {@link AttributesImpl#name2value}.
120 */
121 private final ListMap<String, String> name2value;
122
123 /* ----------------------------------------------------------------- *
124 * constructors *
125 * ----------------------------------------------------------------- */
126
127 /**
128 * Creates a new empty <code>AttributesWrapper</code>
129 * which represents an empty attribute list.
130 */
131 AttributesWrapper() {
132 this.name2value = new ListMap<String, String>();
133 }
134
135 /* ----------------------------------------------------------------- *
136 * methods *
137 * ----------------------------------------------------------------- */
138
139
140 /**
141 * Adds an attribute with the given name and value.
142 *
143 * @param attName
144 * the <code>String</code> representation
145 * of the name of an attribute.
146 * @param attValue
147 * the value of an attribute as a <code>String</code>.
148 * If no value is provided,
149 * this is {@link AttributesImpl#NO_VALUE}.
150 */
151 void addAttribute(String attName, String attValue) {
152 String oldAttValue = this.name2value
153 .put(attName, attValue);
154 if (oldAttValue != null) {
155 // Here, the attribute has occured before.
156 SGMLParser.this.parseExceptionHandler
157 .foundMultipleAttribute(attName,
158 oldAttValue);
159 }
160 }
161
162 Attributes getAttributes() {
163 return new AttributesImpl(this.name2value);
164 }
165 } // class AttributesWrapper
166
167
/**
 * A predicate on characters:
 * the single method decides whether a character passes a certain test.
 */
interface CharTester {

    /**
     * Returns whether <code>chr</code>
     * passes the test defined by this <code>CharTester</code>.
     *
     * @param chr
     *    the <code>char</code> to be tested.
     * @return
     *    <code>true</code> if and only if <code>chr</code>
     *    passes the test defined by this <code>CharTester</code>.
     */
    boolean testChar(char chr);

} // interface CharTester
187
188 /**
189 * Tests for blank, <code>/</code>, <code>></code>.
190 */
191 private static final CharTester TEST_BLANK_GT_SLASH = new CharTester() {
192 public boolean testChar(char chr) {
193 return Character.isWhitespace(chr)
194 || chr == '/'
195 || chr == '>';
196 }
197 };
198
199 /**
200 * Tests for blank or <code>></code>.
201 */
202 private static final CharTester TEST_BLANK_GT = new CharTester() {
203 public boolean testChar(char chr) {
204 return Character.isWhitespace(chr)
205 || chr == '>';
206 }
207 };
208
209 /*
210 * Tests for <code>/</code> or <code>></code>.
211 */
212 /*
213 private static final CharTester TEST_GT_SLASH = new CharTester() {
214 public boolean testChar(char ch) {
215 return ch == '/'
216 || ch == '>';
217 }
218 };
219 */
220
221 /**
222 * Tests for <code><</code>.
223 */
224 private static final CharTester TEST_LT = new CharTester() {
225 public boolean testChar(char chr) {
226 return chr == '<';
227 }
228 };
229
230 /**
231 * Tests for <code>></code>.
232 */
233 private static final CharTester TEST_GT = new CharTester() {
234 public boolean testChar(char chr) {
235 return chr == '>';
236 }
237 };
238
239 /**
240 * Tests for <code>=</code> and for <code>></code>.
241 */
242 private static final CharTester TEST_BLANK_EQUALS_GT = new CharTester() {
243 public boolean testChar(char chr) {
244 return Character.isWhitespace(chr)
245 || chr == '='
246 || chr == '>';
247 }
248 };
249
250 /**
251 * Tests for whitespace.
252 */
253 private static final CharTester TEST_NO_WHITESPACE = new CharTester() {
254 public boolean testChar(char chr) {
255 return !Character.isWhitespace(chr);
256 }
257 };
258
259 /**
260 * Tests for quote both for<code>'</code> and for <code>"</code>.
261 */
262 /*
263 private static final CharTester TEST_QUOTE = new CharTester() {
264 public boolean testChar(char chr) {
265 return chr == '\''
266 || chr == '"';
267 }
268 };
269 */
270
271 /*
272 * Tests for end of comment <code>--></code>.
273 * This tests for a sequence of characters
274 * and confirms after having read the last one.
275 */
276 private static final CharTester TEST_END_OF_COMMENT = new CharTester() {
277
278 /**
279 * Contains the sequence <code>--></code>
280 * representing the end of a comment.
281 */
282 static final String END_OF_COMMENT = "-->";
283
284 /**
285 * Contains the index in {@link #END_OF_COMMENT}
286 * which is to be compared next by {@link #testChar}.
287 */
288 private int index = 0;
289
290 /**
291 * Returns whether the last characters tested
292 * are <code>--></code>.
293 *
294 * @param chr
295 * a <code>char</code>.
296 * @return
297 * whether the last characters tested
298 * including <code>char</code> are <code>--></code>.
299 * In particular, if less than three characters are read
300 * this is <code>false</code>.
301 */
302 public boolean testChar(char chr) {
303 if (END_OF_COMMENT.charAt(index++) == chr) {
304 if (this.index == END_OF_COMMENT.length() - 1) {
305 this.index = 0;
306 return true;
307 } else {
308 return false;
309 }
310 } else {
311 this.index = 0;
312 return false;
313 }
314 }
315 }; // TEST_END_OF_COMMENT
316
317 /**
318 * A <code>CharTester</code> which allows to specify
319 * the character which passes the test.
320 */
321 static class SpecCharTester implements CharTester {
322
323 /**
324 * The character which passes the test {@link #testChar}.
325 */
326 private char chr;
327
328 /**
329 * Sets {@link #chr} to the specified character value.
330 *
331 * @param chr
332 * a <code>char</code> value.
333 */
334 void setChar(char chr) {
335 this.chr = chr;
336 }
337
338 /**
339 * Returns whether the given character coincides with {@link #chr}.
340 *
341 * @param chr
342 * a <code>char</code> value.
343 * @return
344 * whether <code>ch</code> coincides with {@link #chr}.
345 */
346 public boolean testChar(char chr) {
347 return chr == this.chr;
348 }
349 } // SpecCharTester
350
351 /**
352 * Tests for a specified character.
353 * This is used for quotes which allow the cases
354 * <code>'</code> and <code>"</code>.
355 *
356 * @see XMLsGMLspecifica#parseAttribute
357 */
358 private static final SpecCharTester TEST_SPEC = new SpecCharTester();
359
360 /**
361 * Class which buffers the read stream.
362 */
363 static class Buffer {
364
365 /* ----------------------------------------------------------------- *
366 * fields *
367 * ----------------------------------------------------------------- */
368
369 /**
370 * The reader buffered.
371 */
372 private final Reader reader;
373
374 /**
375 * The current buffer.
376 * The current parts to be read start with
377 * <code>bufferArray[{@link #start}]</code> and end with
378 * <code>bufferArray[{@link #end}]</code>, exclusively.
379 */
380 private final char[] bufferArray;
381
382 /**
383 * The first index in {@link #bufferArray}
384 * read in from {@link #reader} but not returned
385 * by {@link #readArray} or {@link #readChar}.
386 */
387 private int start;
388
389 /**
390 * Set by {@link #readArray} and read by {@link #getStartAndMove}.
391 * When invoking {@link #readArray} <code>newStart</code>
392 * is set to {@link #start} and increased
393 * by the number of read charactersincreases.
394 * Then {@link #getStartAndMove} updates {@link #start}
395 * according to <code>newStart</code>.
396 */
397 private int newStart;
398
399 /**
400 * The first index in {@link #bufferArray} not read
401 * from {@link #reader}
402 * or <code>-1</code> if the end of the stream is reached.
403 * This means that <code>bufferArray[end]</code>
404 * either does not exist or at least is not significant.
405 */
406 private int end;
407
408 /* ----------------------------------------------------------------- *
409 * constructors *
410 * ----------------------------------------------------------------- */
411
412 /**
413 * Creates a new <code>Buffer</code> from the given reader
414 * with the given size.
415 *
416 * @param reader
417 * the <code>Reader</code> to be buffered.
418 * @param length
419 * the length of the buffer.
420 * @exception IOException
421 * if an error occurs
422 */
423 Buffer(Reader reader, int length) throws IOException {
424 this.reader = reader;
425 this.bufferArray = new char[length];
426 this.start = 0;
427 this.end = this.start; // signifies: reading necessary.
428 }
429
430 /* ----------------------------------------------------------------- *
431 * methods *
432 * ----------------------------------------------------------------- */
433
434
435 /**
436 * Returns whether this buffer is currently empty.
437 * When this is the case and someone tries to read further characters
438 * this will lead to a trial
439 * to read further pieces from {@link #reader}.
440 *
441 * @return a <code>boolean</code> value
442 * signifying whether this buffer is currently empty.
443 */
444 boolean isEmpty() {
445 return this.end == this.start;
446 }
447
448 /**
449 * Returns whether the end of the stream is reached.
450 *
451 * @return
452 * a <code>boolean</code> specifying
453 * whether the end of the stream is reached.
454 */
455 boolean reachedEOS() {
456 return this.end == -1;
457 }
458
459 /**
460 * Reads a single <code>char</code> and returns it.
461 *
462 * @return
463 * an <code>int</code> value
464 * which is either the next <code>char</code> read in
465 * or <code>-1</code> which signifies the end of the stream.
466 * @exception IOException
467 * if an error occurs
468 */
469 int readChar() throws IOException {
470 if (reachedEOS()) {
471 return -1;
472 }
473 if (isEmpty()) {
474 this.start = 0;
475 this.end = this.reader.read(this.bufferArray);
476 if (reachedEOS()) {
477 return -1;
478 }
479 }
480 return this.bufferArray[this.start++];
481 }
482
483 /**
484 * Reads an array from {@link #reader}.
485 * As a side effect, writes the field {@link #newStart}.
486 * Also, if the portion of {@link #bufferArray}
487 * to be read, i.e. between {@link #start} and {@link #end},
488 * is empty, a new portion is buffered.
489 *
490 * @param charTester
491 * a <code>CharTester</code> which signifies
492 * when to end reading from the buffer.
493 * @return
494 * an <code>int</code> signifying the number of <code>char</code>s
495 * read or <code>-1</code> which signifies the end of the stream.
496 * It is read to the next < or, if there is none,
497 * to the end of the stream.
498 * Thus there is a difference between the return values
499 * <code>-1</code> and <code>0</code>.
500 * @exception IOException
501 * if an error occurs
502 */
503 int readArray(CharTester charTester) throws IOException {
504 if (reachedEOS()) {
505 return -1;
506 }
507 if (isEmpty()) {
508 this.start = 0;
509 this.end = this.reader.read(this.bufferArray);
510 //System.out.println("read: "+this.end);
511 if (reachedEOS()) {
512 return -1;
513 }
514 }
515
516 for (int i = this.start; i < this.end; i++) {
517 if (charTester.testChar(this.bufferArray[i])) {
518 // found match described by charTester
519 this.newStart = i;
520 return this.newStart - this.start;
521 }
522 }
523 // Here, the test always failed.
524 this.newStart = this.end;
525 return this.end - this.start;
526 }
527
528 /**
529 * Describe <code>readStringBuffer</code> method here.
530 *
531 * @param charTester
532 * a <code>CharTester</code> which determines
533 * the first character not read
534 * into the resulting <code>StringBuffer</code>.
535 * @param elementName
536 * a <code>String</code> which determines
537 * the element under consideration.
538 * This is only used for generating the message of a
539 * <code>SAXParseException</code>.
540 * <p>
541 * Allowed values: {@link #START_TAG}, {@link #END_TAG},
542 * {@link #PROC_INSTR},
543 * {@link #ATTR_NAME}, {@link #WHITESP_IN_ATTR}
544 * and {@link #ATTR_VALUE}. ****** comment and <!element missing.
545 * @return
546 * a <code>StringBuffer</code> containing characters
547 * starting with the current one until one
548 * <code>charTester</code> returns <code>true</code>.
549 * @exception IOException
550 * if an io-error occurs
551 * @exception SAXParseException
552 * if the parser faces the end of the stream
553 * while scanning the current element.
554 */
555 StringBuffer readStringBuffer(CharTester charTester,
556 String elementName)
557 throws IOException, SAXParseException {
558
559 StringBuffer qName = new StringBuffer();
560 int numRead = 0;
561 do {
562 numRead = readArray(charTester);
563 if (numRead == -1) {
564 throw new SAXParseException
565 ("End of stream while scanning "
566 + elementName + ". "
567 + "Read so far: \""
568 + qName + QUOTE_DOT, null);
569 }
570 qName.append(getChars(),
571 getStartAndMove(),
572 numRead);
573 } while (isEmpty());
574
575 return qName;
576 }
577
578 /**
579 * Returns the buffer of <code>char</code>s.
580 *
581 * @return
582 * the <code>char[]</code> {@link #bufferArray}.
583 */
584 char[] getChars() {
585 return this.bufferArray;
586 }
587
588 /**
589 * Moves {@link #newStart} to {@link #start}
590 * and returns the old value of {@link #start}.
591 *
592 * @return
593 * the old <code>int</code> value of {@link #start}.
594 */
595 int getStartAndMove() {
596 int ret = this.start;
597 this.start = this.newStart;
598 return ret;
599 }
600
601 /**
602 * Get method for {@link #start}.
603 *
604 * @return {@link #start}
605 */
606 int getStart() {
607 return this.start;
608 }
609
610 /**
611 * Get method for {@link #end}.
612 *
613 * @return {@link #end}
614 */
615 int getEnd() {
616 return this.end;
617 }
618 } // class Buffer
619
620 /**
621 * Provides a bunch of methods fpr parsing
622 * with implementations specific to xml and sgml.
623 */
624 interface XMLsGMLspecifica {
625 // **** SGMLParser.this.currChar must be the character
626 // after the attribute list.
627 /**
628 * Parses one attribute and adds it to the given attribute list.
629 *
630 * @param attributes
631 * an <code>AttributesImpl</code>
632 * to which the attribute parsed is added.
633 * @exception IOException
634 * if an io-error occurs
635 * @exception SAXException
636 * if a syntactical error occurs
637 */
638 void parseAttribute(AttributesWrapper attributes)
639 throws IOException, SAXException;
640
641 /**
642 * Parses a comment or any declaration
643 * starting with <code><!...</code> and notifying the handler.
644 *
645 * @exception IOException
646 * if an io-error occurs
647 * @exception SAXException
648 * if a syntactical error occurs
649 */
650 void parseCommentElemTypeDecl() throws IOException, SAXException;
651
652 /**
653 * Parses a processing instruction or any declaration
654 * starting with <code><?...</code> and notifying the handler.
655 *
656 * @exception IOException
657 * if an io-error occurs
658 * @exception SAXException
659 * if a syntactical error occurs
660 */
661 void parseExtProcessingInstruction() throws IOException, SAXException;
662
663 } // interface XML_SGML_Specifica
664
665 /**
666 * Contains the <code>HTML</code>-specific part of the parser.
667 */
668 private final XMLsGMLspecifica htmlAttributeParser =
669 new XMLsGMLspecifica() {
670
671 public void parseAttribute(AttributesWrapper attributes)
672 throws IOException, SAXException {
673 String attName;
674 String attValue;
675 StringBuffer qName;
676
677 // Parse attribute name
678 qName = SGMLParser.this.buffer.
679 readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
680 qName.insert(0, (char) SGMLParser.this.currChar);
681 attName = qName.toString().toLowerCase(Locale.ENGLISH);
682 //System.out.println("attName: |"+attName+"|");
683
684 // Here, the attribute may have a value or not.
685
686 // Skip whitespace either after having parsed the attribute
687 // or between its name and its value.
688 SGMLParser.this.currChar =
689 SGMLParser.this.buffer.readChar(); //NOPMD
690 if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
691 qName = SGMLParser.this.buffer.
692 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
693 SGMLParser.this.currChar =
694 SGMLParser.this.buffer.readChar(); //NOPMD
695 }
696
697 // Here is the decision whether a value is provided or not.
698 if (SGMLParser.this.currChar != SYMB_EQ) {
699 // Here, no value may be given
700 attributes.addAttribute(attName, AttributesImpl.NO_VALUE);
701 //System.out.println("attName: |"+attName+"|");
702 //System.out.println("noValue@@"+(char)SGMLParser.this.currChar+"|");
703 return;
704 }
705 // Here, clearly a value must follow
706
707 // Skip whitespaces
708 qName = SGMLParser.this.buffer.
709 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
710 SGMLParser.this.currChar =
711 SGMLParser.this.buffer.readChar(); //NOPMD
712
713 // Parse the attribute value.
714 switch (SGMLParser.this.currChar) {
715 case '\'':
716 // fall through
717 case '"':
718 // the attribute value is quoted.
719 char quote = (char) SGMLParser.this.currChar;
720 TEST_SPEC.setChar(quote);
721 //SGMLParser.this.currChar =
722 // SGMLParser.this.buffer.readChar();
723
724 //System.out.println("quote@@"+SGMLParser.this.currChar);
725 qName = new StringBuffer();
726 while (true) {
727 qName.append(SGMLParser.this.buffer.
728 readStringBuffer(TEST_SPEC, ATTR_VALUE));
729 if (qName.length() != 0
730 && qName.charAt(qName.length() - 1) == '\\') {
731 qName.setCharAt(qName.length() - 1, quote);
732 } else {
733 // read the quote
734 SGMLParser.this.currChar = //NOPMD
735 SGMLParser.this.buffer.readChar();
736 break;
737 }
738 }
739 break;
740 default:
741 //System.out.println("no quote@@"+SGMLParser.this.currChar);
742 // the attribute value is not quoted.
743 qName = SGMLParser.this.buffer.
744 readStringBuffer(TEST_BLANK_GT, ATTR_VALUE);
745 qName.insert(0, (char) SGMLParser.this.currChar);
746 break;
747 }
748 // read the character after the attribute value
749 SGMLParser.this.currChar =
750 SGMLParser.this.buffer.readChar(); //NOPMD
751
752 attValue = qName.toString();
753 attributes.addAttribute(attName, attValue);
754 //System.out.println("attName: |"+attName+"|");
755 //System.out.println("attValue: |"+attValue+"|");
756 }
757
758 public void parseCommentElemTypeDecl()
759 throws IOException, SAXException {
760 //System.out.println("comment?");
761
762 SGMLParser.this.currChar = //NOPMD
763 SGMLParser.this.buffer.readChar();
764 if (SGMLParser.this.currChar != SYMB_COMMENT) {
765 //int numRead =
766 SGMLParser.this.buffer.readArray(TEST_GT);
767 SGMLParser.this.buffer.getStartAndMove();
768 return;
769 }
770 // Here, object starts with "<!-....."
771
772 SGMLParser.this.currChar = //NOPMD
773 SGMLParser.this.buffer.readChar();
774 if (SGMLParser.this.currChar != SYMB_COMMENT) {
775 throw new SAXParseException
776 ("Comments must start with \"<!--\" but found "
777 + "\"<!-" + (char) SGMLParser.this.currChar + QUOTE_DOT,
778 null);
779 }
780 //System.out.println("comment!");
781
782 int numRead = 0;
783 do {
784 numRead = SGMLParser.this.buffer
785 .readArray(TEST_END_OF_COMMENT);
786 if (numRead == -1) {
787 StringBuffer qName = new StringBuffer();
788 qName.append(SGMLParser.this.buffer.getChars(),
789 SGMLParser.this.buffer.getStartAndMove(),
790 numRead);
791 throw new SAXParseException
792 ("End of stream while scanning comment. "
793 + "Recently read: \"" + qName + QUOTE_DOT,
794 null);
795 }
796
797 SGMLParser.this.buffer.getStartAndMove();
798 } while (SGMLParser.this.buffer.isEmpty());
799 /*
800 StringBuffer qName = new StringBuffer();
801 qName.append(SGMLParser.this.buffer.getChars(),
802 SGMLParser.this.buffer.getStartAndMove(),
803 numRead);
804
805 System.out.println("read so far: |"+qName+"|");
806 */
807
808 SGMLParser.this.buffer.getStartAndMove();
809 // NO NOTIFY!!
810 }
811
812 public void parseExtProcessingInstruction()
813 throws IOException, SAXException {
814 parseStartOrStartEndTag();
815 }
816 }; // htmlXML_SGML_Specifica
817
818 /**
819 * Contains the <code>XML</code>-specific part of the parser.
820 */
821 private final XMLsGMLspecifica xmlAttributeParser =
822 new XMLsGMLspecifica() {
823
824 public void parseAttribute(AttributesWrapper attributes)
825 throws IOException, SAXException {
826 String attName;
827 String attValue;
828 StringBuffer qName;
829
830 // Parse attribute name
831 qName = SGMLParser.this.buffer.
832 readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
833 qName.insert(0, (char) SGMLParser.this.currChar);
834 attName = qName.toString();
835 //System.out.println("attName: |"+attName+"|");
836
837 // Here, the attribute may have a value or not.
838
839 // Skip whitespace either after having parsed the attribute
840 // or between its name and its value.
841 SGMLParser.this.currChar =
842 SGMLParser.this.buffer.readChar(); //NOPMD
843 if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
844 qName = SGMLParser.this.buffer.
845 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
846 SGMLParser.this.currChar =
847 SGMLParser.this.buffer.readChar(); //NOPMD
848 }
849
850 // Here is the decision whether a value is provided or not.
851 if (SGMLParser.this.currChar != SYMB_EQ) {
852 // Here, a value is missing.
853 throw new SAXParseException
854 ("Missing value for attribute \""
855 + attName + QUOTE_DOT, null);
856 }
857 // Here, clearly a value must follow ****
858
859
860 // Skip whitespaces
861 qName = SGMLParser.this.buffer.
862 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
863 SGMLParser.this.currChar =
864 SGMLParser.this.buffer.readChar(); //NOPMD
865
866
867 // Parse the attribute value.
868 switch (SGMLParser.this.currChar) {
869 case '\'':
870 // fall through
871 case '"':
872 // the attribute value is quoted.
873 char quote = (char) SGMLParser.this.currChar;
874 TEST_SPEC.setChar(quote);
875 //SGMLParser.this.currChar =
876 // SGMLParser.this.buffer.readChar();
877
878 //System.out.println("quote@@"+SGMLParser.this.currChar);
879 qName = new StringBuffer();
880 while (true) {
881 qName.append(SGMLParser.this.buffer.
882 readStringBuffer(TEST_SPEC, ATTR_VALUE));
883 if (qName.length() != 0
884 && qName.charAt(qName.length() - 1) == '\\') {
885 qName.setCharAt(qName.length() - 1, quote);
886 } else {
887 // read the quote
888 SGMLParser.this.currChar = //NOPMD
889 SGMLParser.this.buffer.readChar();
890 break;
891 }
892 }
893 break;
894 default:
895 throw new SAXParseException
896 ("Value of attribute \"" + attName +
897 "\" is not quoted. ",
898 null);
899 }
900 // read the character after the attribute value
901 SGMLParser.this.currChar =
902 SGMLParser.this.buffer.readChar(); //NOPMD
903
904 attValue = qName.toString();
905 attributes.addAttribute(attName, attValue);
906 //System.out.println("attName: |"+attName+"|");
907 //System.out.println("attValue: |"+attValue+"|");
908 }
909
910 public void parseCommentElemTypeDecl()
911 throws IOException, SAXException {
912 // ******** comments will not work that way!!!*****
913
914 SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
915 //**** comment
916 // Here, also the empty processing instruction or comment
917 // would be possible.
918
919 //this.buffer.getStart();
920 //System.out.println("-qName: |"+qName+"|");
921 SGMLParser.this.currChar =
922 SGMLParser.this.buffer.readChar(); //NOPMD
923 //assert this.currChar == '>';
924 SGMLParser.this.contentHandler.processingInstruction(null, null);
925 }
926
927
928 public void parseExtProcessingInstruction()
929 throws IOException, SAXException {
930
931 //StringBuffer qName =
932 SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
933 // Here, also the empty processing instruction would be possible.
934
935 //this.buffer.getStart();
936 //System.out.println("-qName: |"+qName+"|");
937 SGMLParser.this.currChar =
938 SGMLParser.this.buffer.readChar(); //NOPMD
939 //assert this.currChar == '>';
940 SGMLParser.this.contentHandler.processingInstruction(null, null);
941 }
942
943 }; // xmlXML_SGML_Specifica
944
945 /* --------------------------------------------------------------------- *
946 * class constants *
947 * --------------------------------------------------------------------- */
948
949
950 /**
951 * The size of the buffer used internally.
952 * This must be at least <code>1</code>.
953 * I found no significant difference in speed when increasing this number.
954 * The buffer coming from a stream from a URL seems to hav maximal size
955 * of <code>1448</code> whereas for file streams there seems no bound.
956 * In the cases considered, the file is read in as a whole.
957 */
958 private static final int BUFFER_SIZE = 999999;
959
960 // for notification of a sax parse exception with Buffer.readStringBuffer.
961 /**
962 * Short string representation of the object currently parsed.
963 * Contains the specific part of the message of the exception
964 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
965 */
966 private static final String START_TAG = "start tag";
967
968 /**
969 * Short string representation of the object currently parsed.
970 * Contains the specific part of the message of the exception
971 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
972 */
973 private static final String END_TAG = "end tag";
974
975 /**
976 * Short string representation of the object currently parsed.
977 * Contains the specific part of the message of the exception
978 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
979 */
980 private static final String PROC_INSTR = "processing instruction";
981
982 /**
983 * Short string representation of the object currently parsed.
984 * Contains the specific part of the message of the exception
985 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
986 */
987 private static final String ATTR_NAME = "attribute name";
988
989 /**
990 * Short string representation of the object currently parsed.
991 * Contains the specific part of the message of the exception
992 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
993 */
994 private static final String WHITESP_IN_ATTR = "whitespace in attribute";
995
996 /**
997 * Short string representation of the object currently parsed.
998 * Contains the specific part of the message of the exception
999 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
1000 */
1001 private static final String ATTR_VALUE = "attribute value";
1002
1003 /* --------------------------------------------------------------------- *
1004 * fields *
1005 * --------------------------------------------------------------------- */
1006
1007 /**
1008 * Contains class with methods specific for xml and sgml, respectively.
1009 */
1010 private XMLsGMLspecifica xmlSgmlSpecifica = htmlAttributeParser;
1011
1012 /**
1013 * The current character or <code>-1</code>
1014 * to signfy the end of the stream.
1015 */
1016 private int currChar;
1017
1018 /**
1019 * The <code>ContentHandler</code> registered.
1020 */
1021 private ContentHandler contentHandler;
1022
1023 /**
1024 * The <code>ParseExceptionHandler</code> registered.
1025 */
1026 private ParseExceptionHandler parseExceptionHandler;
1027
1028 /**
1029 * The buffer of the input stream.
1030 */
1031 private Buffer buffer;
1032
1033 /* --------------------------------------------------------------------- *
1034 * constructors *
1035 * --------------------------------------------------------------------- */
1036
/**
 * Creates a new <code>SGMLParser</code>
 * with a {@link TrivialContentHandler} as handler for content
 * and a <code>ParseExceptionHandler.Impl</code>
 * as handler for exceptions.
 */
public SGMLParser() {
    this.contentHandler = new TrivialContentHandler();
    this.parseExceptionHandler = new ParseExceptionHandler.Impl();
}
1046
1047 /* --------------------------------------------------------------------- *
1048 * methods *
1049 * --------------------------------------------------------------------- */
1050
1051
1052 /**
1053 * Parses the <code>InputSource</code> given
1054 * but delegates everything inside a tag or a processing instruction
1055 * to {@link #parseTagOrPI}.
1056 *
1057 * @param src
1058 * an <code>InputSource</code>.
1059 * @exception IOException if an error occurs
1060 * @exception SAXException if an error occurs
1061 */
1062 void parse(InputSource src) throws IOException, SAXException {
1063 parse(src.getCharacterStream());
1064 }
1065
1066 /**
1067 * Parses the given <code>InputStream</code>.
1068 *
1069 * @param reader
1070 * an <code>Reader</code> sequentializing an SGML document.
1071 * @exception IOException
1072 * if an error reading the stream occurs.
1073 * @exception SAXException
1074 * if an error with the sgml-syntax occurs.
1075 */
1076 public void parse(Reader reader) throws IOException, SAXException {
1077
1078 this.buffer = new Buffer(reader, BUFFER_SIZE);
1079 int numRead = this.buffer.readArray(TEST_LT);
1080 // notify handler that first part of document was successfully read.
1081 this.contentHandler.startDocument();
1082 while (numRead != -1) {
1083 this.currChar = this.buffer.readChar(); // the '<' char?
1084 if (this.currChar == SYMB_TAG) {
1085 // a tag or a PI.
1086 numRead = parseTagOrPI();
1087 } else {
1088 // either characters or ignoreableWhitespace
1089 numRead = parseText();
1090 }
1091 }
1092 // Here, the document is finished.
1093 this.contentHandler.endDocument();
1094 }
1095
1096 /**
1097 * Parses everything outside a tag, a processing instruction, ...
1098 * everything within brackets <code><</code> and <code>></code>.
1099 * ***** Missing: distinction between notification
1100 * of characters and whitespace. ****
1101 *
1102 * @exception IOException
1103 * if an error reading the stream occurs.
1104 * @exception SAXException
1105 * if an error with the sgml-syntax occurs.
1106 * @see #parseTagOrPI
1107 */
1108 private int parseText() throws IOException, SAXException {
1109 int numRead = this.buffer.readArray(TEST_LT);
1110 if (numRead != -1) {
1111 /*
1112 System.out.println("text: |"+new String(buffer.getChars(),
1113 buffer.getStartAndMove(),
1114 numRead)+"|");
1115 */
1116 this.contentHandler.characters(this.buffer.getChars(),
1117 this.buffer.getStartAndMove(),
1118 numRead);
1119 }
1120
1121 //buffer.getStartAndMove();
1122 return numRead;
1123 }
1124
1125 /**
1126 * Parses an end-tag notifying the underlying handler.
1127 *
1128 * @exception IOException
1129 * if an error reading the stream occurs.
1130 * @exception SAXException
1131 * if an error with the sgml-syntax occurs.
1132 */
1133 void parseEndTag() throws IOException, SAXException {
1134 StringBuffer qName = this.buffer.readStringBuffer(TEST_GT, END_TAG);
1135 // Here, also the empty tag would be possible.
1136
1137 //this.buffer.getStart();
1138 //System.out.println("end tag: |"+qName+"|");
1139 this.currChar = this.buffer.readChar();
1140 //assert this.currChar == '>';
1141 this.contentHandler.endElement(null,
1142 null,
1143 qName.toString());
1144 this.currChar = this.buffer.readChar();
1145 }
1146 /*
1147 public void parseCommentElemTypeDecl()
1148 throws IOException, SAXException {
1149 // ******** comments will not work that way!!!*****
1150
1151 StringBuffer qName = this.buffer
1152 .readStringBuffer(TEST_GT, PROC_INSTR);//**** comment
1153 // Here, also the empty processing instruction or comment
1154 // would be possible.
1155
1156 //this.buffer.getStart();
1157 //System.out.println("-qName: |"+qName+"|");
1158 this.currChar = this.buffer.readChar();
1159 //assert this.currChar == '>';
1160 this.handler.processingInstruction(null, null);
1161 }
1162 */
1163
1164 /**
1165 * Parses a start-tag or, for xml, an empty tag.
1166 *
1167 * @exception IOException
1168 * if an error reading the stream occurs.
1169 * @exception SAXException
1170 * if an error with the sgml-syntax occurs.
1171 */
1172 void parseStartOrStartEndTag() throws IOException, SAXException {
1173
1174 // ***** Better read the name of the tag and
1175 // then single out problems with chars by a handler
1176 if (!Character.isLetter((char) this.currChar)) {
1177 this.parseExceptionHandler
1178 .foundIllegalCharInTag((char) this.currChar);
1179 // Ignore the previously read char.
1180 this.currChar = this.buffer.readChar();
1181 }
1182
1183
1184 StringBuffer qName = this.buffer
1185 .readStringBuffer(TEST_BLANK_GT_SLASH, START_TAG);
1186 qName.insert(0, (char) this.currChar);
1187 // Here, also the empty tag would be possible.
1188 //System.out.println("start tag: |"+qName+"|");
1189
1190 // Skip whitespaces
1191 this.currChar = this.buffer.readChar();
1192 while (Character.isWhitespace((char) this.currChar)) {
1193 this.buffer.
1194 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1195 this.currChar = this.buffer.readChar();
1196 }
1197
1198 AttributesWrapper attributesWrapper = new AttributesWrapper();
1199 // Here, either /, > or an attribute occurs
1200 //System.out.println("this.currChar: |"+(char)this.currChar+"|");
1201 while (this.currChar != '/' && this.currChar != '>') {
1202 // parse the following attribute list
1203 this.xmlSgmlSpecifica.parseAttribute(attributesWrapper);
1204
1205 // Skip whitespaces
1206 while (Character.isWhitespace((char) this.currChar)) {
1207 this.buffer
1208 .readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1209 this.currChar = this.buffer.readChar();
1210 }
1211 } // end parsing attribute list
1212 //System.out.println("-this.currChar: |"+(char)this.currChar+"|");
1213
1214
1215 Attributes attributes = attributesWrapper.getAttributes();
1216 switch (this.currChar) {
1217 case '/':
1218 // start-end-tag called empty tag
1219
1220 // skip illegal characters between "/" and ">" ****
1221 skipped: while (true) { //NOPMD
1222 this.currChar = this.buffer.readChar();
1223 switch (this.currChar) {
1224 case '>':
1225 break skipped;
1226 case -1:
1227 this.parseExceptionHandler
1228 .foundUnexpectedEndOfDocument();
1229 break;
1230 default:
1231 this.parseExceptionHandler
1232 .foundCharAfterEndOfEndTag
1233 ((char) this.currChar);
1234 break;
1235 } // switch
1236 }
1237
1238 this.contentHandler.startElement(null,
1239 null,
1240 qName.toString(),
1241 attributes);
1242 this.contentHandler.endElement(null,
1243 null,
1244 qName.toString());
1245 break;
1246 case '>':
1247 this.contentHandler.startElement(null,
1248 null,
1249 qName.toString(),
1250 attributes);
1251 break;
1252 default:
1253 throw new SAXParseException
1254 ("Expected finishing tag \"" + qName
1255 + "\" with character '/' or '>' "
1256 + "but found '" + (char) this.currChar + "'. ", null);
1257 }
1258 }
1259
1260 /**
1261 * Parses everything within a tag, a processing instruction, ...
1262 * everything within brackets <code><</code> and <code>></code>.
1263 *
1264 * @see #parseText
1265 */
1266 private int parseTagOrPI() throws IOException, SAXException {
1267 //System.out.println("parseTagOrPI");
1268
1269 this.currChar = this.buffer.readChar();
1270 //this.currChar = this.reader.read();
1271 switch (this.currChar) {
1272 case '/':
1273 // parsing an end-tag
1274 parseEndTag();
1275 //this.currChar = this.reader.read();
1276 break;
1277 case '!':
1278 // parsing no tag at all:
1279 // a processing instruction or a comment
1280 this.xmlSgmlSpecifica.parseCommentElemTypeDecl();
1281 //this.currChar = this.reader.read();
1282 break;
1283 case '?':
1284 // parsing no tag at all:
1285 // a processing instruction or a comment
1286 this.xmlSgmlSpecifica.parseExtProcessingInstruction();
1287 //this.currChar = this.reader.read();
1288 break;
1289 default:
1290 // parsing a start-tag or an empty-element-tag
1291 parseStartOrStartEndTag();
1292 break;
1293 } // end of switch ()
1294 //this.currChar = this.buffer.readChar();
1295 // Here, the buffer is ready
1296 // to read the first character. after the generalized tag.
1297
1298 //System.out.println("read last: |"+(char)this.currChar+"|");
1299 //System.out.println("read last: |"+ this.currChar+"|");
1300
1301 return 1;
1302 }
1303
1304 /**
1305 * Sets {@link #contentHandler}.
1306 *
1307 * @param contentHandler
1308 * a <code>ContentHandler</code>.
1309 */
1310 public void setContentHandler(ContentHandler contentHandler) {
1311 if (isXMLParser()) {
1312 this.contentHandler = contentHandler;
1313 } else {
1314 this.contentHandler = new SGMLFilter(contentHandler);
1315 }
1316 }
1317
1318 /**
1319 * Returns {@link #contentHandler}.
1320 *
1321 * @return
1322 * the <code>ContentHandler</code> {@link #contentHandler}.
1323 */
1324 public ContentHandler getContentHandler() {
1325 if (isXMLParser()) {
1326 return this.contentHandler;
1327 } else {
1328 return ((SGMLFilter) this.contentHandler).getWrapped();
1329 }
1330 }
1331
1332 /**
1333 * Sets {@link #parseExceptionHandler}.
1334 *
1335 * @param peHandler
1336 * a <code>ParseExceptionHandler</code>.
1337 */
1338 public void setExceptionHandler(ParseExceptionHandler peHandler) {
1339 this.parseExceptionHandler = peHandler;
1340 }
1341
1342 /**
1343 * Returns {@link #parseExceptionHandler}.
1344 *
1345 * @return
1346 * the <code>ContentHandler</code> {@link #parseExceptionHandler}.
1347 */
1348 public ParseExceptionHandler getExceptionHandler() {
1349 return this.parseExceptionHandler;
1350 }
1351
1352 /**
1353 * Sets whether this parser is used as an xml-parser.
1354 * If this is false, which is the default,
1355 * it s an html-parser.
1356 *
1357 * @param xml
1358 * a <code>boolean</code> value signifying
1359 * whether this parser will be used as an xml-parser in the sequel.
1360 * @return
1361 * a <code>boolean</code> value signifying
1362 * whether before invoking this method
1363 * this parser was used as an xml-parser
1364 */
1365 public boolean parseXML(boolean xml) {
1366 boolean result = this.xmlSgmlSpecifica == xmlAttributeParser;
1367 this.xmlSgmlSpecifica = xml
1368 ? xmlAttributeParser
1369 : htmlAttributeParser;
1370 return result;
1371 }
1372
1373 public boolean isXMLParser() {
1374 return this.xmlSgmlSpecifica == xmlAttributeParser;
1375 }
1376 }