View Javadoc
/**
 * Title:        S/MIME Project
 * Description:  Creating S/MIME email transport capabilities.
 * Copyright:    Copyright (c) 2001
 * @author       Vladimir Radisic
 * @version      2.0.1
 */
8   
9   package org.webdocwf.util.smime.util;
10  
11  
12  import org.webdocwf.util.smime.exception.SMIMEException;
13  import java.util.Vector;
14  import java.net.MalformedURLException;
15  import java.net.URL;
16  import java.io.InputStream;
17  import java.io.ByteArrayOutputStream;
18  import java.io.File;
19  import org.w3c.dom.Attr;
20  import org.w3c.dom.Document;
21  import org.w3c.dom.NamedNodeMap;
22  import org.w3c.dom.Node;
23  import org.w3c.dom.NodeList;
24  import org.w3c.tidy.Tidy;
25  
26  
27  /***
28   * HtmlAnalyzer class is used for parsing html code which has to become content
29   * of the message. For parsing is used JTidy parser. As result of parsing, DOM
30   * (Document Object Model) structure is obtained. It is tree-like construction
31   * with nodes and hierarchical structures that descript input html code. This
32   * structure is easy for browsing and searching for specific html elements and
33   * attributes. By using DOM, all references to resources (image, movie, sound... ),
34   * defined in "src" and "background" attributes, are explored and swapped with
35   * generated unique Content-ID values which are necessary in forming
36   * "multipart/related" MimeMultipart object.<BR>
37   * <BR>
38   * DOM, generated inside of the object of this class, is also used in the process of
39   * generation plain/text message based on, and derived from the given html code.
40   * This plain text is later used in creation of "multipart/alternative"
41   * MimeMultipart object.
42   */
43  public class HtmlAnalyzer {
44  
45      /***
46       * plain/text representation of page
47       */
48      private String plainText = "";
49  
50      /***
51       * Enable/disable p tag in text/html to text/plain conversion.
52       */
53      private boolean pTagEnable = true;
54  
55      /***
56       * Path to html file or prefix path to the embeded resource's adresses in
57       * html code (for example for "src" attribute of IMG tag). Can be null which
58       * means that prefix won't be added to resources location in the process of
59       * searching for specific adress attributes given in html code.
60       */
61      private String absolutPath = null;
62  
63      /***
64       * Container for parsed html document in DOM (Document Object Model)
65       * representation.
66       */
67      private Document doc;
68  
69      /***
70       * Indent from left margin pointer. This information is used in the process of
71       * generation plain text message based on html code.
72       */
73      private int indent = 0;
74  
75      /***
76       * Current sequential number of OL (ordered list) html element. This information
77       * is used in the process of generation plain text message based on html code.
78       */
79      private int olNumber = 1;
80  
81      /***
82       * Current html element is OL (ordered list), UN (unordered list) or something
83       * else. This information is used in the process of generation plain text message based
84       * on html code.
85       */
86      private String ul_ol = "";
87  
88      /***
89       * Constant used in generating indent from left side. This information is used in
90       * the process of generation plain text message based on html code.
91       */
92      private final String indentString =
93          "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t";
94  
95      /***
96       * Container for storing pairs of replaced url or file addresses and
97       * corresponding generated Content-ID values.
98       */
99      private Vector sourceLinks = new Vector(0, 1);
100 
101     /***
102      * Enable/disable swapping resource references in html code with generated
103      * value for Content-ID message bodypart header line. Default value is true
104      * (enable swapping)
105      */
106     private boolean enableSwapping = true;
107 
108     /***
109      * Constructs HtmlAnalyzer from data given from InputStream. This constructor
110      * parses html code from input stream withouth swaping resources' locations from
111      * atribute's "src" and "background" value with generated Content-ID values. Also,
112      * it is performed generation of plain text message based on html code.
113      * @param content0 html code given as InputStream
114      * @SMIMEException caused by its private method analyze().
115      */
116     public HtmlAnalyzer(InputStream content0) throws SMIMEException {
117         Tidy tidy = new Tidy();
118 
119         tidy.setWraplen(1000);
120         tidy.setShowWarnings(false);
121         tidy.setUpperCaseTags(true);
122         doc = (tidy.parseDOM(content0, null));
123         enableSwapping = false;
124         analyze(doc);
125         plainText = plainText + "\r\n";
126     }
127 
128     /***
129      * Constructs HtmlAnalyzer from data given from InputStream. This constructor
130      * parses html code from input stream with swaping resources' locations from
131      * atribute's "src and "background" value with generated Content-ID values. In
132      * that process, it is used given second paremeter "path0" which represents
133      * common path to all resources in html code with relative path adresses. Also,
134      * it is performed generation of plain text message based on html code.
135      * @param content0 html code given as InputStream.
136      * @param path0 common path used for resolving all resources in html code with
137      * relative path adresses.
138      * @SMIMEException caused by its private method analyze().
139      */
140     public HtmlAnalyzer(InputStream content0, String path0) throws SMIMEException {
141         if (path0 != null) {
142             absolutPath = new String(path0);
143             if (absolutPath.charAt(absolutPath.length() - 1) == '//' ||
144                 absolutPath.charAt(absolutPath.length() - 1) == '/')
145                 absolutPath = absolutPath.substring(0, absolutPath.length() - 1);
146 
147             absolutPath = absolutPath.replace('/', File.separatorChar);
148             absolutPath = absolutPath.replace('//', File.separatorChar) + File.separator;
149         }
150 
151         Tidy tidy = new Tidy();
152 
153         tidy.setWraplen(1000);
154         tidy.setShowWarnings(false);
155         tidy.setUpperCaseTags(true);
156         doc = (tidy.parseDOM(content0, null));
157         analyze(doc);
158         plainText = plainText + "\r\n";
159     }
160 
161     /***
162      * Returns pairs of swapped resource URL adresses or File paths and appropriate
163      * generated Content IDs.
164      * @return Vector object whose even (and 0) indexes contain resource addresses
165      * as File or String objects, and whose odd indexes contain appropriate
166      * swapped Content-ID values.
167      */
168     public Vector getSwappedAdresses() {
169         return sourceLinks;
170     }
171 
172     /***
173      * Returns plain/text representation of given html code document
174      * @return html document transformed to plain/text.
175      */
176     public String getPlainText() {
177         return plainText;
178     }
179 
180     /***
181      * Returns html/text document passed throught JTidy html parser. All resource
182      * references which were accessible on the file system are swapped with
183      * generated content ID value. Also, all virtual references to appropriate
184      * InputStream resources (see setContent methods in classes from package
185      * org.webdocwf.util.smime.smime) are also swapped with generated Content-ID
186      * value.
187      * @return parsed html/text document.
188      * @exception SMIMEException caused by non SMIMEException which is:
189      * UnsupportedEncodingException.
190      */
191     public String getHtmlText() throws SMIMEException {
192         String returnString;
193 
194         Tidy tidy = new Tidy();
195 
196         tidy.setWraplen(1000);
197         ByteArrayOutputStream out = new ByteArrayOutputStream();
198 
199         tidy.pprint(doc, out);
200 
201         try {
202             returnString = out.toString("ISO-8859-1");
203             out.close();
204         } catch (Exception e) {
205             throw SMIMEException.getInstance(this, e, "getHtmlText");
206         }
207 
208         return returnString;
209     }
210 
211     /***
212      * Analyzes html code and creates alternate plain/text message from html code.
213      * Also, it creates Vector with corresponding pairs of resource locations discovered
214      * in html code (values of "background" and "src" attributes) and generated
215      * Content-ID values.
216      * @param node0 node element got from JTidy parser.
217      * @exception SMIMEException caused by MimeAssist.generateID() method or by
218      * its private method existenceOfResource().
219      */
220     private void analyze(Node node0) throws SMIMEException {
221 
222         if (node0 == null) {
223             return;
224         }
225         String brLine = "\r\n";
226         int type = node0.getNodeType();
227 
228         boolean pTagEnable_old = true;
229         int indent_old = 0;
230         int olNumber_old = 1;
231         String ul_ol_old = "";
232 
233         switch (type) {
234         case Node.DOCUMENT_NODE: // Document node
235             analyze(((Document) node0).getDocumentElement());
236             break;
237 
238         case Node.ELEMENT_NODE: // Element node
239             String elName = node0.getNodeName();
240 
241             if (elName.equalsIgnoreCase("br")) {
242                 plainText = plainText + brLine;
243                 if (indent > 0)
244                     plainText = plainText +
245                             indentString.substring(0, indent - 1);
246             } else if (elName.equalsIgnoreCase("hr")) {
247                 plainText = plainText + brLine +
248                         "==================================================" +
249                         brLine;
250             } else if (elName.equalsIgnoreCase("p")) {
251                 if (pTagEnable) {
252                     plainText = plainText + brLine + brLine;
253                     if (indent > 0)
254                         plainText = plainText +
255                                 indentString.substring(0, indent - 1);
256                 }
257                 pTagEnable = true;
258             } else if (elName.equalsIgnoreCase("ul")) {
259                 pTagEnable_old = pTagEnable;
260                 pTagEnable = false;
261                 ul_ol_old = ul_ol;
262                 ul_ol = elName;
263                 indent_old = indent;
264                 indent++;
265             } else if (elName.equalsIgnoreCase("ol")) {
266                 pTagEnable_old = pTagEnable;
267                 pTagEnable = false;
268                 ul_ol_old = ul_ol;
269                 ul_ol = elName;
270                 indent_old = indent;
271                 indent++;
272                 olNumber_old = olNumber;
273             } else if (elName.equalsIgnoreCase("li")) {
274                 pTagEnable = false;
275                 if (ul_ol.equalsIgnoreCase("ul")) {
276                     plainText = plainText + brLine +
277                             indentString.substring(0, indent - 1) +
278                             ">> ";
279                 } else if (ul_ol.equalsIgnoreCase("ol")) {
280                     plainText = plainText + brLine +
281                             indentString.substring(0, indent - 1) +
282                             olNumber + ". ";
283                     olNumber++;
284                 }
285             } else if (elName.equalsIgnoreCase("blockquote")) {
286                 pTagEnable_old = pTagEnable;
287                 pTagEnable = false;
288                 indent_old = indent;
289                 indent++;
290                 plainText = plainText + brLine +
291                         indentString.substring(0, indent);
292             } else if (elName.equalsIgnoreCase("q")) {
293                 pTagEnable_old = pTagEnable;
294                 pTagEnable = false;
295                 plainText = plainText + "\"";
296             } else if (elName.equalsIgnoreCase("table")) {
297                 plainText = plainText + brLine +
298                         "**************************************************" + brLine +
299                         "--------------------------------------------------" + brLine +
300                         "--  --  --  --  --  --  --  --  --  --  --  --  --" + brLine;
301             } else if (elName.equalsIgnoreCase("tr")) {
302                 plainText = plainText + brLine;
303             } else if (elName.equalsIgnoreCase("td")) {
304                 plainText = plainText + brLine;
305             }
306             // attributes handling
307             NamedNodeMap attrs = node0.getAttributes();
308 
309             for (int i = 0; i < attrs.getLength(); i++) {
310                 attrs.item(i).getNodeName().toUpperCase();
311                 if (enableSwapping &&
312                     ((attrs.item(i).getNodeName()).equalsIgnoreCase("src") ||
313                         (attrs.item(i).getNodeName()).equalsIgnoreCase("background"))) {
314                     String resource = attrs.item(i).getNodeValue();
315                     String cid = null;
316 
317                     //******nnn<virtual_file_name>   <-- resources got from byte array input stream
318                     if (resource.substring(0, 5).equalsIgnoreCase("*****")) {
319                         for (int j = 0; j < sourceLinks.size() & cid == null; j = j + 2) {
320                             if (sourceLinks.elementAt(j) instanceof String &&
321                                 ((String) sourceLinks.elementAt(j)).equals(resource))
322                                 cid = (String) sourceLinks.elementAt(j + 1);
323                         }
324                         if (cid == null) {
325                             cid = MimeAssist.generateID();
326                             sourceLinks.add(resource);
327                             sourceLinks.add(cid);
328                         }
329                         attrs.item(i).setNodeValue("cid:" + cid);
330                     } else {
331                         File fRes = existenceOfResource(resource);
332 
333                         if (fRes != null) {
334                             for (int j = 0; j < sourceLinks.size() & cid == null; j = j + 2) {
335                                 if (sourceLinks.elementAt(j) instanceof File &&
336                                     ((File) sourceLinks.elementAt(j)).compareTo(fRes) == 0)
337                                     cid = (String) sourceLinks.elementAt(j + 1);
338                             }
339                             if (cid == null) {
340                                 cid = MimeAssist.generateID();
341                                 sourceLinks.add(fRes);
342                                 sourceLinks.add(cid);
343                             }
344                             attrs.item(i).setNodeValue("cid:" + cid);
345                         }
346                     }
347                 }
348             }
349             // finish of opening particular element tag
350             NodeList children = node0.getChildNodes(); //Passing through the node tree
351 
352             if (children != null) {
353                 int len = children.getLength();
354 
355                 for (int i = 0; i < len; i++) {
356                     analyze(children.item(i));
357                 }
358             }
359             // start of closing particular element tag
360             if (elName.equalsIgnoreCase("ul")) {
361                 pTagEnable = pTagEnable_old;
362                 ul_ol = ul_ol_old;
363                 indent = indent_old;
364             } else if (elName.equalsIgnoreCase("ol")) {
365                 pTagEnable = pTagEnable_old;
366                 ul_ol = ul_ol_old;
367                 indent = indent_old;
368                 olNumber = olNumber_old;
369             } else if (elName.equalsIgnoreCase("table")) {
370                 plainText = plainText + brLine +
371                         "**************************************************";
372             } else if (elName.equalsIgnoreCase("tr")) {
373                 plainText = plainText + brLine +
374                         "--------------------------------------------------";
375             } else if (elName.equalsIgnoreCase("td")) {
376                 plainText = plainText + brLine +
377                         "--  --  --  --  --  --  --  --  --  --  --  --  --";
378             } else if (elName.equalsIgnoreCase("blockquote")) {
379                 indent = indent_old;
380                 pTagEnable = pTagEnable_old;
381             } else if (elName.equalsIgnoreCase("q")) {
382                 plainText = plainText + "\"";
383                 pTagEnable = pTagEnable_old;
384             }
385 
386             break;
387 
388         case Node.TEXT_NODE:
389             String nodeVal = node0.getNodeValue();
390 
391             plainText = plainText + nodeVal;
392             break;
393         }
394 
395     }
396 
397     /**
398      * Method checks if it is given a resource reachable in the destination file system.
399      * @param resource0 can be absolute or relative path with specified file name
400      * or adress of file in URL form (example "file:///c:/temp/example.gif" )
401      * @return object of class File which represents existance of the resource file
402      * or null if resource does not exist on the destination in file system.
403      * @SMIMEException caused by non SMIMEException which is IOException.
404      */
405     private File existenceOfResource(String resource0) throws SMIMEException {
406 
407         boolean resourceIsUrl = true;
408         String resource = new String(resource0);
409         URL url = null;
410 
411         try {
412             url = new URL(resource0);
413         } catch (MalformedURLException e) {
414             resourceIsUrl = false;
415         }
416 
417         if (resourceIsUrl == true && (!url.getProtocol().equalsIgnoreCase("file")))
418             return null;
419         else if (resourceIsUrl == true && url.getProtocol().equalsIgnoreCase("file")) {
420             resource = url.getFile();
421         }
422 
423         resource = replaceHex(resource);
424         resource = resource.replace('/', File.separatorChar);
425         resource = resource.replace('//', File.separatorChar);
426         File fRes = new File(resource);
427 
428         try {
429             if (fRes.exists())
430                 return fRes.getAbsoluteFile().getCanonicalFile();
431 
432             fRes = new File(absolutPath + resource);
433             if (fRes.exists())
434                 return fRes.getAbsoluteFile().getCanonicalFile();
435 
436             fRes = new File(absolutPath + resource);
437             if (fRes.exists())
438                 return fRes.getAbsoluteFile().getCanonicalFile();
439         } catch (Exception e) {
440             throw SMIMEException.getInstance(this, e, "existenceOfResource");
441         }
442 
443         return null;
444     }
445 
446     /***
447      * Replaces possible hexadecimal representation of blank characters (presented
448      * with "%20") from resource String representation, with blank character.
449      * @param resources0 resource which is examined for hex representation of blank
450      * characters.
451      * @return String with replaced hexadecimal representation of blank characters.
452      */
453     private String replaceHex(String resources0) {
454         while (resources0.indexOf("%20") != -1) {
455             resources0 = resources0.substring(0, resources0.indexOf("%20")) + " " +
456                     resources0.substring(resources0.indexOf("%20") + 3);
457         }
458         return resources0;
459     }
460 
461 }
This page was automatically generated by Maven