1
2
3 package eu.simuline.util.sgml;
4
5 import eu.simuline.util.ListMap;
6
7 import java.io.Reader;
8 import java.io.IOException;
9
10 import org.xml.sax.ContentHandler;
11 import org.xml.sax.Locator;
12 import org.xml.sax.Attributes;
13 import org.xml.sax.InputSource;
14 import org.xml.sax.SAXException;
15 import org.xml.sax.SAXParseException;
16 import java.util.Locale;
17
18 /**
19 * A rudimentary <code>SGML</code> parser with something like a SAX-api.
20 *
21 * @author <a href="mailto:ernst.reissner@simuline.eu">Ernst Reissner</a>
22 * @version 1.0
23 */
24 public final class SGMLParser {
25
/**
 * A closing double quote followed by a period and a blank.
 * Terminates quoted fragments in the messages of parse exceptions.
 */
private static final String QUOTE_DOT = "\". ";

/**
 * The equals sign. In attribute parsing its presence after the name
 * decides whether an attribute value follows.
 */
private static final char SYMB_EQ = '=';

/**
 * The dash. Two of them following <code>&lt;!</code> start a comment.
 */
private static final char SYMB_COMMENT = '-';

/**
 * The character opening a tag.
 */
private static final char SYMB_TAG = '<';
30
31 /* --------------------------------------------------------------------- *
32 * inner classes *
33 * --------------------------------------------------------------------- */
34
35 /**
36 * A <code>ContentHandler</code> which simply ignores all events.
37 * May be used for debugging.
38 */
39 static class TrivialContentHandler implements ContentHandler {
40
41 /** <!-- api-docs inherited from interface implemented. -->*/
42 public void setDocumentLocator(Locator locator) {
43 // is empty.
44 }
45
46 public void startDocument() throws SAXException {
47 // is empty.
48 }
49
50 public void endDocument() throws SAXException {
51 // is empty.
52 }
53
54 public void startPrefixMapping(String prefix,
55 String uri)
56 throws SAXException {
57 // is empty.
58 }
59
60 public void endPrefixMapping(String prefix)
61 throws SAXException {
62 // is empty.
63 }
64
65 public void startElement(String namespaceURI,
66 String localName,
67 String qName,
68 Attributes atts)
69 throws SAXException {
70 // is empty.
71 }
72
73 public void endElement(String namespaceURI,
74 String localName,
75 String qName)
76 throws SAXException {
77 // is empty.
78 }
79
80 public void characters(char[] chr,
81 int start,
82 int length)
83 throws SAXException {
84 // is empty.
85 }
86
87 public void ignorableWhitespace(char[] chr,
88 int start,
89 int length)
90 throws SAXException {
91 // is empty.
92 }
93
94 public void processingInstruction(String target,
95 String data)
96 throws SAXException {
97 // is empty.
98 }
99
100 public void skippedEntity(String name)
101 throws SAXException {
102 // is empty.
103 }
104 } // class TrivialContentHandler
105
106
107 /**
108 * An **** partial **** implementation
109 * of the SAX-interface <code>Attributes</code>
110 * which allows to set name-value-pairs by method {@link #addAttribute}.
111 */
112 class AttributesWrapper {
113
114 /* ----------------------------------------------------------------- *
115 * fields *
116 * ----------------------------------------------------------------- */
117
118 /**
119 * See {@link AttributesImpl#name2value}.
120 */
121 private final ListMap<String, String> name2value;
122
123 /* ----------------------------------------------------------------- *
124 * constructors *
125 * ----------------------------------------------------------------- */
126
127 /**
128 * Creates a new empty <code>AttributesWrapper</code>
129 * which represents an empty attribute list.
130 */
131 AttributesWrapper() {
132 this.name2value = new ListMap<String, String>();
133 }
134
135 /* ----------------------------------------------------------------- *
136 * methods *
137 * ----------------------------------------------------------------- */
138
139
140 /**
141 * Adds an attribute with the given name and value.
142 *
143 * @param attName
144 * the <code>String</code> representation
145 * of the name of an attribute.
146 * @param attValue
147 * the value of an attribute as a <code>String</code>.
148 * If no value is provided,
149 * this is {@link AttributesImpl#NO_VALUE}.
150 */
151 void addAttribute(String attName, String attValue) {
152 String oldAttValue = this.name2value
153 .put(attName, attValue);
154 if (oldAttValue != null) {
155 // Here, the attribute has occured before.
156 SGMLParser.this.parseExceptionHandler
157 .foundMultipleAttribute(attName,
158 oldAttValue);
159 }
160 }
161
162 Attributes getAttributes() {
163 return new AttributesImpl(this.name2value);
164 }
165 } // class AttributesWrapper
166
167
/**
 * Provides a single method which decides whether the given character
 * passes a certain test.
 * The buffer uses testers to find the first character
 * at which reading stops.
 */
interface CharTester {

    /**
     * Returns whether the given character <code>chr</code>
     * passes the test given by this <code>CharTester</code>.
     *
     * @param chr
     *    the <code>char</code> to be tested.
     * @return
     *    the <code>boolean</code>
     *    signifying whether the given character <code>chr</code>
     *    passes the test given by this <code>CharTester</code>.
     */
    boolean testChar(char chr);

} // interface CharTester
187
188 /**
189 * Tests for blank, <code>/</code>, <code>></code>.
190 */
191 private static final CharTester TEST_BLANK_GT_SLASH = new CharTester() {
192 public boolean testChar(char chr) {
193 return Character.isWhitespace(chr)
194 || chr == '/'
195 || chr == '>';
196 }
197 };
198
199 /**
200 * Tests for blank or <code>></code>.
201 */
202 private static final CharTester TEST_BLANK_GT = new CharTester() {
203 public boolean testChar(char chr) {
204 return Character.isWhitespace(chr)
205 || chr == '>';
206 }
207 };
208
209 /*
210 * Tests for <code>/</code> or <code>></code>.
211 */
212 /*
213 private static final CharTester TEST_GT_SLASH = new CharTester() {
214 public boolean testChar(char ch) {
215 return ch == '/'
216 || ch == '>';
217 }
218 };
219 */
220
221 /**
222 * Tests for <code><</code>.
223 */
224 private static final CharTester TEST_LT = new CharTester() {
225 public boolean testChar(char chr) {
226 return chr == '<';
227 }
228 };
229
230 /**
231 * Tests for <code>></code>.
232 */
233 private static final CharTester TEST_GT = new CharTester() {
234 public boolean testChar(char chr) {
235 return chr == '>';
236 }
237 };
238
239 /**
240 * Tests for <code>=</code> and for <code>></code>.
241 */
242 private static final CharTester TEST_BLANK_EQUALS_GT = new CharTester() {
243 public boolean testChar(char chr) {
244 return Character.isWhitespace(chr)
245 || chr == '='
246 || chr == '>';
247 }
248 };
249
250 /**
251 * Tests for whitespace.
252 */
253 private static final CharTester TEST_NO_WHITESPACE = new CharTester() {
254 public boolean testChar(char chr) {
255 return !Character.isWhitespace(chr);
256 }
257 };
258
259 /**
260 * Tests for quote both for<code>'</code> and for <code>"</code>.
261 */
262 /*
263 private static final CharTester TEST_QUOTE = new CharTester() {
264 public boolean testChar(char chr) {
265 return chr == '\''
266 || chr == '"';
267 }
268 };
269 */
270
271 /*
272 * Tests for end of comment <code>--></code>.
273 * This tests for a sequence of characters
274 * and confirms after having read the last one.
275 */
276 private static final CharTester TEST_END_OF_COMMENT = new CharTester() {
277
278 /**
279 * Contains the sequence <code>--></code>
280 * representing the end of a comment.
281 */
282 static final String END_OF_COMMENT = "-->";
283
284 /**
285 * Contains the index in {@link #END_OF_COMMENT}
286 * which is to be compared next by {@link #testChar}.
287 */
288 private int index = 0;
289
290 /**
291 * Returns whether the last characters tested
292 * are <code>--></code>.
293 *
294 * @param chr
295 * a <code>char</code>.
296 * @return
297 * whether the last characters tested
298 * including <code>char</code> are <code>--></code>.
299 * In particular, if less than three characters are read
300 * this is <code>false</code>.
301 */
302 public boolean testChar(char chr) {
303 if (END_OF_COMMENT.charAt(this.index) == chr) {
304 this.index++;
305 if (this.index == END_OF_COMMENT.length() - 1) {
306 this.index = 0;
307 return true;
308 } else {
309 return false;
310 }
311 } else {
312 this.index = 0;
313 return false;
314 }
315 }
316 }; // TEST_END_OF_COMMENT
317
318 /**
319 * A <code>CharTester</code> which allows to specify
320 * the character which passes the test.
321 */
322 static class SpecCharTester implements CharTester {
323
324 /**
325 * The character which passes the test {@link #testChar}.
326 */
327 private char chr;
328
329 /**
330 * Sets {@link #chr} to the specified character value.
331 *
332 * @param chr
333 * a <code>char</code> value.
334 */
335 void setChar(char chr) {
336 this.chr = chr;
337 }
338
339 /**
340 * Returns whether the given character coincides with {@link #chr}.
341 *
342 * @param chr
343 * a <code>char</code> value.
344 * @return
345 * whether <code>ch</code> coincides with {@link #chr}.
346 */
347 public boolean testChar(char chr) {
348 return chr == this.chr;
349 }
350 } // SpecCharTester
351
/**
 * Tests for a specified character.
 * This is used for quotes which allow the cases
 * <code>'</code> and <code>"</code>:
 * the attribute parsers set the quote character found
 * before reading the quoted value.
 * Note that this single instance is shared and mutable.
 *
 * @see XMLsGMLspecifica#parseAttribute
 */
private static final SpecCharTester TEST_SPEC = new SpecCharTester();
360
361 /**
362 * Class which buffers the read stream.
363 */
364 static class Buffer {
365
366 /* ----------------------------------------------------------------- *
367 * fields *
368 * ----------------------------------------------------------------- */
369
370 /**
371 * The reader buffered.
372 */
373 private final Reader reader;
374
375 /**
376 * The current buffer.
377 * The current parts to be read start with
378 * <code>bufferArray[{@link #start}]</code> and end with
379 * <code>bufferArray[{@link #end}]</code>, exclusively.
380 */
381 private final char[] bufferArray;
382
383 /**
384 * The first index in {@link #bufferArray}
385 * read in from {@link #reader} but not returned
386 * by {@link #readArray} or {@link #readChar}.
387 */
388 private int start;
389
390 /**
391 * Set by {@link #readArray} and read by {@link #getStartAndMove}.
392 * When invoking {@link #readArray} <code>newStart</code>
393 * is set to {@link #start} and increased
394 * by the number of read charactersincreases.
395 * Then {@link #getStartAndMove} updates {@link #start}
396 * according to <code>newStart</code>.
397 */
398 private int newStart;
399
400 /**
401 * The first index in {@link #bufferArray} not read
402 * from {@link #reader}
403 * or <code>-1</code> if the end of the stream is reached.
404 * This means that <code>bufferArray[end]</code>
405 * either does not exist or at least is not significant.
406 */
407 private int end;
408
409 /* ----------------------------------------------------------------- *
410 * constructors *
411 * ----------------------------------------------------------------- */
412
413 /**
414 * Creates a new <code>Buffer</code> from the given reader
415 * with the given size.
416 *
417 * @param reader
418 * the <code>Reader</code> to be buffered.
419 * @param length
420 * the length of the buffer.
421 * @exception IOException
422 * if an error occurs
423 */
424 Buffer(Reader reader, int length) throws IOException {
425 this.reader = reader;
426 this.bufferArray = new char[length];
427 this.start = 0;
428 this.end = this.start; // signifies: reading necessary.
429 }
430
431 /* ----------------------------------------------------------------- *
432 * methods *
433 * ----------------------------------------------------------------- */
434
435
436 /**
437 * Returns whether this buffer is currently empty.
438 * When this is the case and someone tries to read further characters
439 * this will lead to a trial
440 * to read further pieces from {@link #reader}.
441 *
442 * @return a <code>boolean</code> value
443 * signifying whether this buffer is currently empty.
444 */
445 boolean isEmpty() {
446 return this.end == this.start;
447 }
448
449 /**
450 * Returns whether the end of the stream is reached.
451 *
452 * @return
453 * a <code>boolean</code> specifying
454 * whether the end of the stream is reached.
455 */
456 boolean reachedEOS() {
457 return this.end == -1;
458 }
459
460 /**
461 * Reads a single <code>char</code> and returns it.
462 *
463 * @return
464 * an <code>int</code> value
465 * which is either the next <code>char</code> read in
466 * or <code>-1</code> which signifies the end of the stream.
467 * @exception IOException
468 * if an error occurs
469 */
470 int readChar() throws IOException {
471 if (reachedEOS()) {
472 return -1;
473 }
474 if (isEmpty()) {
475 this.start = 0;
476 this.end = this.reader.read(this.bufferArray);
477 if (reachedEOS()) {
478 return -1;
479 }
480 }
481 return this.bufferArray[this.start++];
482 }
483
484 /**
485 * Reads an array from {@link #reader}.
486 * As a side effect, writes the field {@link #newStart}.
487 * Also, if the portion of {@link #bufferArray}
488 * to be read, i.e. between {@link #start} and {@link #end},
489 * is empty, a new portion is buffered.
490 *
491 * @param charTester
492 * a <code>CharTester</code> which signifies
493 * when to end reading from the buffer.
494 * @return
495 * an <code>int</code> signifying the number of <code>char</code>s
496 * read or <code>-1</code> which signifies the end of the stream.
497 * It is read to the next < or, if there is none,
498 * to the end of the stream.
499 * Thus there is a difference between the return values
500 * <code>-1</code> and <code>0</code>.
501 * @exception IOException
502 * if an error occurs
503 */
504 int readArray(CharTester charTester) throws IOException {
505 if (reachedEOS()) {
506 return -1;
507 }
508 if (isEmpty()) {
509 this.start = 0;
510 this.end = this.reader.read(this.bufferArray);
511 //System.out.println("read: "+this.end);
512 if (reachedEOS()) {
513 return -1;
514 }
515 }
516
517 for (int i = this.start; i < this.end; i++) {
518 if (charTester.testChar(this.bufferArray[i])) {
519 // found match described by charTester
520 this.newStart = i;
521 return this.newStart - this.start;
522 }
523 }
524 // Here, the test always failed.
525 this.newStart = this.end;
526 return this.end - this.start;
527 }
528
529 /**
530 * Describe <code>readStringBuffer</code> method here.
531 *
532 * @param charTester
533 * a <code>CharTester</code> which determines
534 * the first character not read
535 * into the resulting <code>StringBuffer</code>.
536 * @param elementName
537 * a <code>String</code> which determines
538 * the element under consideration.
539 * This is only used for generating the message of a
540 * <code>SAXParseException</code>.
541 * <p>
542 * Allowed values: {@link #START_TAG}, {@link #END_TAG},
543 * {@link #PROC_INSTR},
544 * {@link #ATTR_NAME}, {@link #WHITESP_IN_ATTR}
545 * and {@link #ATTR_VALUE}. ****** comment and <!element missing.
546 * @return
547 * a <code>StringBuffer</code> containing characters
548 * starting with the current one until one
549 * <code>charTester</code> returns <code>true</code>.
550 * @exception IOException
551 * if an io-error occurs
552 * @exception SAXParseException
553 * if the parser faces the end of the stream
554 * while scanning the current element.
555 */
556 StringBuffer readStringBuffer(CharTester charTester,
557 String elementName)
558 throws IOException, SAXParseException {
559
560 StringBuffer qName = new StringBuffer();
561 int numRead = 0;
562 do {
563 numRead = readArray(charTester);
564 if (numRead == -1) {
565 throw new SAXParseException
566 ("End of stream while scanning "
567 + elementName + ". "
568 + "Read so far: \""
569 + qName + QUOTE_DOT, null);
570 }
571 qName.append(getChars(),
572 getStartAndMove(),
573 numRead);
574 } while (isEmpty());
575
576 return qName;
577 }
578
579 /**
580 * Returns the buffer of <code>char</code>s.
581 *
582 * @return
583 * the <code>char[]</code> {@link #bufferArray}.
584 */
585 char[] getChars() {
586 return this.bufferArray;
587 }
588
589 /**
590 * Moves {@link #newStart} to {@link #start}
591 * and returns the old value of {@link #start}.
592 *
593 * @return
594 * the old <code>int</code> value of {@link #start}.
595 */
596 int getStartAndMove() {
597 int ret = this.start;
598 this.start = this.newStart;
599 return ret;
600 }
601
602 /**
603 * Get method for {@link #start}.
604 *
605 * @return {@link #start}
606 */
607 int getStart() {
608 return this.start;
609 }
610
611 /**
612 * Get method for {@link #end}.
613 *
614 * @return {@link #end}
615 */
616 int getEnd() {
617 return this.end;
618 }
619 } // class Buffer
620
/**
 * Provides a bunch of methods for parsing
 * with implementations specific to xml and sgml.
 */
interface XMLsGMLspecifica {
    // **** SGMLParser.this.currChar must be the character
    // after the attribute list.
    /**
     * Parses one attribute and adds it to the given attribute list.
     *
     * @param attributes
     *    an <code>AttributesWrapper</code>
     *    to which the attribute parsed is added.
     * @exception IOException
     *    if an io-error occurs
     * @exception SAXException
     *    if a syntactical error occurs
     */
    void parseAttribute(AttributesWrapper attributes)
        throws IOException, SAXException;

    /**
     * Parses a comment or any declaration
     * starting with <code>&lt;!...</code>.
     * Whether the handler is notified depends on the implementation.
     *
     * @exception IOException
     *    if an io-error occurs
     * @exception SAXException
     *    if a syntactical error occurs
     */
    void parseCommentElemTypeDecl() throws IOException, SAXException;

    /**
     * Parses a processing instruction or any declaration
     * starting with <code>&lt;?...</code>.
     * Whether the handler is notified depends on the implementation.
     *
     * @exception IOException
     *    if an io-error occurs
     * @exception SAXException
     *    if a syntactical error occurs
     */
    void parseExtProcessingInstruction() throws IOException, SAXException;

} // interface XMLsGMLspecifica
665
666 /**
667 * Contains the <code>HTML</code>-specific part of the parser.
668 */
669 private final XMLsGMLspecifica htmlAttributeParser =
670 new XMLsGMLspecifica() {
671
672 public void parseAttribute(AttributesWrapper attributes)
673 throws IOException, SAXException {
674 String attName;
675 String attValue;
676 StringBuffer qName;
677
678 // Parse attribute name
679 qName = SGMLParser.this.buffer.
680 readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
681 qName.insert(0, (char) SGMLParser.this.currChar);
682 attName = qName.toString().toLowerCase(Locale.ENGLISH);
683 //System.out.println("attName: |"+attName+"|");
684
685 // Here, the attribute may have a value or not.
686
687 // Skip whitespace either after having parsed the attribute
688 // or between its name and its value.
689 SGMLParser.this.currChar =
690 SGMLParser.this.buffer.readChar(); //NOPMD
691 if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
692 qName = SGMLParser.this.buffer.
693 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
694 SGMLParser.this.currChar =
695 SGMLParser.this.buffer.readChar(); //NOPMD
696 }
697
698 // Here is the decision whether a value is provided or not.
699 if (SGMLParser.this.currChar != SYMB_EQ) {
700 // Here, no value may be given
701 attributes.addAttribute(attName, AttributesImpl.NO_VALUE);
702 //System.out.println("attName: |"+attName+"|");
703 //System.out.println("noValue@@"+(char)SGMLParser.this.currChar+"|");
704 return;
705 }
706 // Here, clearly a value must follow
707
708 // Skip whitespaces
709 qName = SGMLParser.this.buffer.
710 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
711 SGMLParser.this.currChar =
712 SGMLParser.this.buffer.readChar(); //NOPMD
713
714 // Parse the attribute value.
715 switch (SGMLParser.this.currChar) {
716 case '\'':
717 // fall through
718 case '"':
719 // the attribute value is quoted.
720 char quote = (char) SGMLParser.this.currChar;
721 TEST_SPEC.setChar(quote);
722 //SGMLParser.this.currChar =
723 // SGMLParser.this.buffer.readChar();
724
725 //System.out.println("quote@@"+SGMLParser.this.currChar);
726 qName = new StringBuffer();
727 while (true) {
728 qName.append(SGMLParser.this.buffer.
729 readStringBuffer(TEST_SPEC, ATTR_VALUE));
730 if (qName.length() != 0
731 && qName.charAt(qName.length() - 1) == '\\') {
732 qName.setCharAt(qName.length() - 1, quote);
733 } else {
734 // read the quote
735 SGMLParser.this.currChar = //NOPMD
736 SGMLParser.this.buffer.readChar();
737 break;
738 }
739 }
740 break;
741 default:
742 //System.out.println("no quote@@"+SGMLParser.this.currChar);
743 // the attribute value is not quoted.
744 qName = SGMLParser.this.buffer.
745 readStringBuffer(TEST_BLANK_GT, ATTR_VALUE);
746 qName.insert(0, (char) SGMLParser.this.currChar);
747 break;
748 }
749 // read the character after the attribute value
750 SGMLParser.this.currChar =
751 SGMLParser.this.buffer.readChar(); //NOPMD
752
753 attValue = qName.toString();
754 attributes.addAttribute(attName, attValue);
755 //System.out.println("attName: |"+attName+"|");
756 //System.out.println("attValue: |"+attValue+"|");
757 }
758
759 public void parseCommentElemTypeDecl()
760 throws IOException, SAXException {
761 //System.out.println("comment?");
762
763 SGMLParser.this.currChar = //NOPMD
764 SGMLParser.this.buffer.readChar();
765 if (SGMLParser.this.currChar != SYMB_COMMENT) {
766 //int numRead =
767 SGMLParser.this.buffer.readArray(TEST_GT);
768 SGMLParser.this.buffer.getStartAndMove();
769 return;
770 }
771 // Here, object starts with "<!-....."
772
773 SGMLParser.this.currChar = //NOPMD
774 SGMLParser.this.buffer.readChar();
775 if (SGMLParser.this.currChar != SYMB_COMMENT) {
776 throw new SAXParseException
777 ("Comments must start with \"<!--\" but found "
778 + "\"<!-" + (char) SGMLParser.this.currChar + QUOTE_DOT,
779 null);
780 }
781 //System.out.println("comment!");
782
783 int numRead = 0;
784 do {
785 numRead = SGMLParser.this.buffer
786 .readArray(TEST_END_OF_COMMENT);
787 if (numRead == -1) {
788 StringBuffer qName = new StringBuffer();
789 qName.append(SGMLParser.this.buffer.getChars(),
790 SGMLParser.this.buffer.getStartAndMove(),
791 numRead);
792 throw new SAXParseException
793 ("End of stream while scanning comment. "
794 + "Recently read: \"" + qName + QUOTE_DOT,
795 null);
796 }
797
798 SGMLParser.this.buffer.getStartAndMove();
799 } while (SGMLParser.this.buffer.isEmpty());
800 /*
801 StringBuffer qName = new StringBuffer();
802 qName.append(SGMLParser.this.buffer.getChars(),
803 SGMLParser.this.buffer.getStartAndMove(),
804 numRead);
805
806 System.out.println("read so far: |"+qName+"|");
807 */
808
809 SGMLParser.this.buffer.getStartAndMove();
810 // NO NOTIFY!!
811 }
812
813 public void parseExtProcessingInstruction()
814 throws IOException, SAXException {
815 parseStartOrStartEndTag();
816 }
817 }; // htmlXML_SGML_Specifica
818
819 /**
820 * Contains the <code>XML</code>-specific part of the parser.
821 */
822 private final XMLsGMLspecifica xmlAttributeParser =
823 new XMLsGMLspecifica() {
824
825 public void parseAttribute(AttributesWrapper attributes)
826 throws IOException, SAXException {
827 String attName;
828 String attValue;
829 StringBuffer qName;
830
831 // Parse attribute name
832 qName = SGMLParser.this.buffer.
833 readStringBuffer(TEST_BLANK_EQUALS_GT, ATTR_NAME);
834 qName.insert(0, (char) SGMLParser.this.currChar);
835 attName = qName.toString();
836 //System.out.println("attName: |"+attName+"|");
837
838 // Here, the attribute may have a value or not.
839
840 // Skip whitespace either after having parsed the attribute
841 // or between its name and its value.
842 SGMLParser.this.currChar =
843 SGMLParser.this.buffer.readChar(); //NOPMD
844 if (Character.isWhitespace((char) SGMLParser.this.currChar)) {
845 qName = SGMLParser.this.buffer.
846 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
847 SGMLParser.this.currChar =
848 SGMLParser.this.buffer.readChar(); //NOPMD
849 }
850
851 // Here is the decision whether a value is provided or not.
852 if (SGMLParser.this.currChar != SYMB_EQ) {
853 // Here, a value is missing.
854 throw new SAXParseException
855 ("Missing value for attribute \""
856 + attName + QUOTE_DOT, null);
857 }
858 // Here, clearly a value must follow ****
859
860
861 // Skip whitespaces
862 qName = SGMLParser.this.buffer.
863 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
864 SGMLParser.this.currChar =
865 SGMLParser.this.buffer.readChar(); //NOPMD
866
867
868 // Parse the attribute value.
869 switch (SGMLParser.this.currChar) {
870 case '\'':
871 // fall through
872 case '"':
873 // the attribute value is quoted.
874 char quote = (char) SGMLParser.this.currChar;
875 TEST_SPEC.setChar(quote);
876 //SGMLParser.this.currChar =
877 // SGMLParser.this.buffer.readChar();
878
879 //System.out.println("quote@@"+SGMLParser.this.currChar);
880 qName = new StringBuffer();
881 while (true) {
882 qName.append(SGMLParser.this.buffer.
883 readStringBuffer(TEST_SPEC, ATTR_VALUE));
884 if (qName.length() != 0
885 && qName.charAt(qName.length() - 1) == '\\') {
886 qName.setCharAt(qName.length() - 1, quote);
887 } else {
888 // read the quote
889 SGMLParser.this.currChar = //NOPMD
890 SGMLParser.this.buffer.readChar();
891 break;
892 }
893 }
894 break;
895 default:
896 throw new SAXParseException
897 ("Value of attribute \"" + attName +
898 "\" is not quoted. ",
899 null);
900 }
901 // read the character after the attribute value
902 SGMLParser.this.currChar =
903 SGMLParser.this.buffer.readChar(); //NOPMD
904
905 attValue = qName.toString();
906 attributes.addAttribute(attName, attValue);
907 //System.out.println("attName: |"+attName+"|");
908 //System.out.println("attValue: |"+attValue+"|");
909 }
910
911 public void parseCommentElemTypeDecl()
912 throws IOException, SAXException {
913 // ******** comments will not work that way!!!*****
914
915 SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
916 //**** comment
917 // Here, also the empty processing instruction or comment
918 // would be possible.
919
920 //this.buffer.getStart();
921 //System.out.println("-qName: |"+qName+"|");
922 SGMLParser.this.currChar =
923 SGMLParser.this.buffer.readChar(); //NOPMD
924 //assert this.currChar == '>';
925 SGMLParser.this.contentHandler.processingInstruction(null, null);
926 }
927
928
929 public void parseExtProcessingInstruction()
930 throws IOException, SAXException {
931
932 //StringBuffer qName =
933 SGMLParser.this.buffer.readStringBuffer(TEST_GT, PROC_INSTR);
934 // Here, also the empty processing instruction would be possible.
935
936 //this.buffer.getStart();
937 //System.out.println("-qName: |"+qName+"|");
938 SGMLParser.this.currChar =
939 SGMLParser.this.buffer.readChar(); //NOPMD
940 //assert this.currChar == '>';
941 SGMLParser.this.contentHandler.processingInstruction(null, null);
942 }
943
944 }; // xmlXML_SGML_Specifica
945
946 /* --------------------------------------------------------------------- *
947 * class constants *
948 * --------------------------------------------------------------------- */
949
950
/**
 * The size of the buffer used internally.
 * This must be at least <code>1</code>.
 * I found no significant difference in speed when increasing this number.
 * The buffer coming from a stream from a URL seems to have a maximal size
 * of <code>1448</code> whereas for file streams there seems to be no bound.
 * In the cases considered, the file is read in as a whole.
 */
private static final int BUFFER_SIZE = 999999;
960
// The following constants are passed to Buffer.readStringBuffer
// which uses them in the message of a SAXParseException.
/**
 * Short string representation of a start tag,
 * the object currently parsed.
 * Contains the specific part of the message of the exception
 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
 */
private static final String START_TAG = "start tag";

/**
 * Short string representation of an end tag,
 * the object currently parsed.
 * Contains the specific part of the message of the exception
 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
 */
private static final String END_TAG = "end tag";

/**
 * Short string representation of a processing instruction,
 * the object currently parsed.
 * Contains the specific part of the message of the exception
 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
 */
private static final String PROC_INSTR = "processing instruction";

/**
 * Short string representation of an attribute name,
 * the object currently parsed.
 * Contains the specific part of the message of the exception
 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
 */
private static final String ATTR_NAME = "attribute name";

/**
 * Short string representation of whitespace within an attribute,
 * the object currently parsed.
 * Contains the specific part of the message of the exception
 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
 */
private static final String WHITESP_IN_ATTR = "whitespace in attribute";

/**
 * Short string representation of an attribute value,
 * the object currently parsed.
 * Contains the specific part of the message of the exception
 * that may be thrown by {@link SGMLParser.Buffer#readStringBuffer}.
 */
private static final String ATTR_VALUE = "attribute value";
1003
1004 /* --------------------------------------------------------------------- *
1005 * fields *
1006 * --------------------------------------------------------------------- */
1007
/**
 * Contains class with methods specific for xml and sgml, respectively.
 * Initially, this is the html-specific variant.
 */
private XMLsGMLspecifica xmlSgmlSpecifica = htmlAttributeParser;

/**
 * The current character or <code>-1</code>
 * to signify the end of the stream.
 */
private int currChar;

/**
 * The <code>ContentHandler</code> registered.
 */
private ContentHandler contentHandler;

/**
 * The <code>ParseExceptionHandler</code> registered.
 */
private ParseExceptionHandler parseExceptionHandler;

/**
 * The buffer of the input stream.
 * It is created anew by each invocation of {@link #parse(Reader)}.
 */
private Buffer buffer;
1033
1034 /* --------------------------------------------------------------------- *
1035 * constructors *
1036 * --------------------------------------------------------------------- */
1037
1038 /**
1039 * Creates a new <code>SGMLParser</code>
1040 * with the default handlers for content and exceptions.
1041 */
1042 @SuppressWarnings("checkstyle:nowhitespaceafter")
1043 public SGMLParser() {
1044 this. contentHandler = new TrivialContentHandler();
1045 this.parseExceptionHandler = new ParseExceptionHandler.Impl();
1046 }
1047
1048 /* --------------------------------------------------------------------- *
1049 * methods *
1050 * --------------------------------------------------------------------- */
1051
1052
1053 /**
1054 * Parses the <code>InputSource</code> given
1055 * but delegates everything inside a tag or a processing instruction
1056 * to {@link #parseTagOrPI}.
1057 *
1058 * @param src
1059 * an <code>InputSource</code>.
1060 * @exception IOException if an error occurs
1061 * @exception SAXException if an error occurs
1062 */
1063 void parse(InputSource src) throws IOException, SAXException {
1064 parse(src.getCharacterStream());
1065 }
1066
1067 /**
1068 * Parses the given <code>InputStream</code>.
1069 *
1070 * @param reader
1071 * an <code>Reader</code> sequentializing an SGML document.
1072 * @exception IOException
1073 * if an error reading the stream occurs.
1074 * @exception SAXException
1075 * if an error with the sgml-syntax occurs.
1076 */
1077 public void parse(Reader reader) throws IOException, SAXException {
1078
1079 this.buffer = new Buffer(reader, BUFFER_SIZE);
1080 int numRead = this.buffer.readArray(TEST_LT);
1081 // notify handler that first part of document was successfully read.
1082 this.contentHandler.startDocument();
1083 while (numRead != -1) {
1084 this.currChar = this.buffer.readChar(); // the '<' char?
1085 if (this.currChar == SYMB_TAG) {
1086 // a tag or a PI.
1087 numRead = parseTagOrPI();
1088 } else {
1089 // either characters or ignoreableWhitespace
1090 numRead = parseText();
1091 }
1092 }
1093 // Here, the document is finished.
1094 this.contentHandler.endDocument();
1095 }
1096
1097 /**
1098 * Parses everything outside a tag, a processing instruction, ...
1099 * everything within brackets <code><</code> and <code>></code>.
1100 * ***** Missing: distinction between notification
1101 * of characters and whitespace. ****
1102 *
1103 * @exception IOException
1104 * if an error reading the stream occurs.
1105 * @exception SAXException
1106 * if an error with the sgml-syntax occurs.
1107 * @see #parseTagOrPI
1108 */
1109 private int parseText() throws IOException, SAXException {
1110 int numRead = this.buffer.readArray(TEST_LT);
1111 if (numRead != -1) {
1112 /*
1113 System.out.println("text: |"+new String(buffer.getChars(),
1114 buffer.getStartAndMove(),
1115 numRead)+"|");
1116 */
1117 this.contentHandler.characters(this.buffer.getChars(),
1118 this.buffer.getStartAndMove(),
1119 numRead);
1120 }
1121
1122 //buffer.getStartAndMove();
1123 return numRead;
1124 }
1125
1126 /**
1127 * Parses an end-tag notifying the underlying handler.
1128 *
1129 * @exception IOException
1130 * if an error reading the stream occurs.
1131 * @exception SAXException
1132 * if an error with the sgml-syntax occurs.
1133 */
1134 void parseEndTag() throws IOException, SAXException {
1135 StringBuffer qName = this.buffer.readStringBuffer(TEST_GT, END_TAG);
1136 // Here, also the empty tag would be possible.
1137
1138 //this.buffer.getStart();
1139 //System.out.println("end tag: |"+qName+"|");
1140 this.currChar = this.buffer.readChar();
1141 //assert this.currChar == '>';
1142 this.contentHandler.endElement(null,
1143 null,
1144 qName.toString());
1145 this.currChar = this.buffer.readChar();
1146 }
1147 /*
1148 public void parseCommentElemTypeDecl()
1149 throws IOException, SAXException {
1150 // ******** comments will not work that way!!!*****
1151
1152 StringBuffer qName = this.buffer
1153 .readStringBuffer(TEST_GT, PROC_INSTR);//**** comment
1154 // Here, also the empty processing instruction or comment
1155 // would be possible.
1156
1157 //this.buffer.getStart();
1158 //System.out.println("-qName: |"+qName+"|");
1159 this.currChar = this.buffer.readChar();
1160 //assert this.currChar == '>';
1161 this.handler.processingInstruction(null, null);
1162 }
1163 */
1164
1165 /**
1166 * Parses a start-tag or, for xml, an empty tag.
1167 *
1168 * @exception IOException
1169 * if an error reading the stream occurs.
1170 * @exception SAXException
1171 * if an error with the sgml-syntax occurs.
1172 */
1173 void parseStartOrStartEndTag() throws IOException, SAXException {
1174
1175 // ***** Better read the name of the tag and
1176 // then single out problems with chars by a handler
1177 if (!Character.isLetter((char) this.currChar)) {
1178 this.parseExceptionHandler
1179 .foundIllegalCharInTag((char) this.currChar);
1180 // Ignore the previously read char.
1181 this.currChar = this.buffer.readChar();
1182 }
1183
1184
1185 StringBuffer qName = this.buffer
1186 .readStringBuffer(TEST_BLANK_GT_SLASH, START_TAG);
1187 qName.insert(0, (char) this.currChar);
1188 // Here, also the empty tag would be possible.
1189 //System.out.println("start tag: |"+qName+"|");
1190
1191 // Skip whitespaces
1192 this.currChar = this.buffer.readChar();
1193 while (Character.isWhitespace((char) this.currChar)) {
1194 this.buffer.
1195 readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1196 this.currChar = this.buffer.readChar();
1197 }
1198
1199 AttributesWrapper attributesWrapper = new AttributesWrapper();
1200 // Here, either /, > or an attribute occurs
1201 //System.out.println("this.currChar: |"+(char)this.currChar+"|");
1202 while (this.currChar != '/' && this.currChar != '>') {
1203 // parse the following attribute list
1204 this.xmlSgmlSpecifica.parseAttribute(attributesWrapper);
1205
1206 // Skip whitespaces
1207 while (Character.isWhitespace((char) this.currChar)) {
1208 this.buffer
1209 .readStringBuffer(TEST_NO_WHITESPACE, WHITESP_IN_ATTR);
1210 this.currChar = this.buffer.readChar();
1211 }
1212 } // end parsing attribute list
1213 //System.out.println("-this.currChar: |"+(char)this.currChar+"|");
1214
1215
1216 Attributes attributes = attributesWrapper.getAttributes();
1217 switch (this.currChar) {
1218 case '/':
1219 // start-end-tag called empty tag
1220
1221 // skip illegal characters between "/" and ">" ****
1222 skipped: while (true) { //NOPMD
1223 this.currChar = this.buffer.readChar();
1224 switch (this.currChar) {
1225 case '>':
1226 break skipped;
1227 case -1:
1228 this.parseExceptionHandler
1229 .foundUnexpectedEndOfDocument();
1230 break;
1231 default:
1232 this.parseExceptionHandler
1233 .foundCharAfterEndOfEndTag
1234 ((char) this.currChar);
1235 break;
1236 } // switch
1237 }
1238
1239 this.contentHandler.startElement(null,
1240 null,
1241 qName.toString(),
1242 attributes);
1243 this.contentHandler.endElement(null,
1244 null,
1245 qName.toString());
1246 break;
1247 case '>':
1248 this.contentHandler.startElement(null,
1249 null,
1250 qName.toString(),
1251 attributes);
1252 break;
1253 default:
1254 throw new SAXParseException
1255 ("Expected finishing tag \"" + qName
1256 + "\" with character '/' or '>' "
1257 + "but found '" + (char) this.currChar + "'. ", null);
1258 }
1259 }
1260
1261 /**
1262 * Parses everything within a tag, a processing instruction, ...
1263 * everything within brackets <code><</code> and <code>></code>.
1264 *
1265 * @see #parseText
1266 */
1267 private int parseTagOrPI() throws IOException, SAXException {
1268 //System.out.println("parseTagOrPI");
1269
1270 this.currChar = this.buffer.readChar();
1271 //this.currChar = this.reader.read();
1272 switch (this.currChar) {
1273 case '/':
1274 // parsing an end-tag
1275 parseEndTag();
1276 //this.currChar = this.reader.read();
1277 break;
1278 case '!':
1279 // parsing no tag at all:
1280 // a processing instruction or a comment
1281 this.xmlSgmlSpecifica.parseCommentElemTypeDecl();
1282 //this.currChar = this.reader.read();
1283 break;
1284 case '?':
1285 // parsing no tag at all:
1286 // a processing instruction or a comment
1287 this.xmlSgmlSpecifica.parseExtProcessingInstruction();
1288 //this.currChar = this.reader.read();
1289 break;
1290 default:
1291 // parsing a start-tag or an empty-element-tag
1292 parseStartOrStartEndTag();
1293 break;
1294 } // end of switch ()
1295 //this.currChar = this.buffer.readChar();
1296 // Here, the buffer is ready
1297 // to read the first character. after the generalized tag.
1298
1299 //System.out.println("read last: |"+(char)this.currChar+"|");
1300 //System.out.println("read last: |"+ this.currChar+"|");
1301
1302 return 1;
1303 }
1304
1305 /**
1306 * Sets {@link #contentHandler}.
1307 *
1308 * @param contentHandler
1309 * a <code>ContentHandler</code>.
1310 */
1311 public void setContentHandler(ContentHandler contentHandler) {
1312 if (isXMLParser()) {
1313 this.contentHandler = contentHandler;
1314 } else {
1315 this.contentHandler = new SGMLFilter(contentHandler);
1316 }
1317 }
1318
1319 /**
1320 * Returns {@link #contentHandler}.
1321 *
1322 * @return
1323 * the <code>ContentHandler</code> {@link #contentHandler}.
1324 */
1325 public ContentHandler getContentHandler() {
1326 if (isXMLParser()) {
1327 return this.contentHandler;
1328 } else {
1329 return ((SGMLFilter) this.contentHandler).getWrapped();
1330 }
1331 }
1332
1333 /**
1334 * Sets {@link #parseExceptionHandler}.
1335 *
1336 * @param peHandler
1337 * a <code>ParseExceptionHandler</code>.
1338 */
1339 public void setExceptionHandler(ParseExceptionHandler peHandler) {
1340 this.parseExceptionHandler = peHandler;
1341 }
1342
1343 /**
1344 * Returns {@link #parseExceptionHandler}.
1345 *
1346 * @return
1347 * the <code>ContentHandler</code> {@link #parseExceptionHandler}.
1348 */
1349 public ParseExceptionHandler getExceptionHandler() {
1350 return this.parseExceptionHandler;
1351 }
1352
1353 /**
1354 * Sets whether this parser is used as an xml-parser.
1355 * If this is false, which is the default,
1356 * it s an html-parser.
1357 *
1358 * @param xml
1359 * a <code>boolean</code> value signifying
1360 * whether this parser will be used as an xml-parser in the sequel.
1361 * @return
1362 * a <code>boolean</code> value signifying
1363 * whether before invoking this method
1364 * this parser was used as an xml-parser
1365 */
1366 public boolean parseXML(boolean xml) {
1367 boolean result = this.xmlSgmlSpecifica == xmlAttributeParser;
1368 this.xmlSgmlSpecifica = xml
1369 ? xmlAttributeParser
1370 : htmlAttributeParser;
1371 return result;
1372 }
1373
1374 public boolean isXMLParser() {
1375 return this.xmlSgmlSpecifica == xmlAttributeParser;
1376 }
1377 }