1 /***
2 * Title: S/MIME Project
3 * Description: Creating S/MIME email transport capabilities.
4 * Copyright: Copyright (c) 2001
5 * @Author Vladimir Radisic
6 * @Version 2.0.1
7 */
8
9 package org.webdocwf.util.smime.util;
10
11
12 import org.webdocwf.util.smime.exception.SMIMEException;
13 import java.util.Vector;
14 import java.net.MalformedURLException;
15 import java.net.URL;
16 import java.io.InputStream;
17 import java.io.ByteArrayOutputStream;
18 import java.io.File;
19 import org.w3c.dom.Attr;
20 import org.w3c.dom.Document;
21 import org.w3c.dom.NamedNodeMap;
22 import org.w3c.dom.Node;
23 import org.w3c.dom.NodeList;
24 import org.w3c.tidy.Tidy;
25
26
27 /***
28 * HtmlAnalyzer class is used for parsing html code which has to become content
29 * of the message. For parsing is used JTidy parser. As result of parsing, DOM
30 * (Document Object Model) structure is obtained. It is tree-like construction
31 * with nodes and hierarchical structures that descript input html code. This
32 * structure is easy for browsing and searching for specific html elements and
33 * attributes. By using DOM, all references to resources (image, movie, sound... ),
34 * defined in "src" and "background" attributes, are explored and swapped with
35 * generated unique Content-ID values which are necessary in forming
36 * "multipart/related" MimeMultipart object.<BR>
37 * <BR>
38 * DOM, generated inside of the object of this class, is also used in the process of
39 * generation plain/text message based on, and derived from the given html code.
40 * This plain text is later used in creation of "multipart/alternative"
41 * MimeMultipart object.
42 */
43 public class HtmlAnalyzer {
44
45 /***
46 * plain/text representation of page
47 */
48 private String plainText = "";
49
50 /***
51 * Enable/disable p tag in text/html to text/plain conversion.
52 */
53 private boolean pTagEnable = true;
54
55 /***
56 * Path to html file or prefix path to the embeded resource's adresses in
57 * html code (for example for "src" attribute of IMG tag). Can be null which
58 * means that prefix won't be added to resources location in the process of
59 * searching for specific adress attributes given in html code.
60 */
61 private String absolutPath = null;
62
63 /***
64 * Container for parsed html document in DOM (Document Object Model)
65 * representation.
66 */
67 private Document doc;
68
69 /***
70 * Indent from left margin pointer. This information is used in the process of
71 * generation plain text message based on html code.
72 */
73 private int indent = 0;
74
75 /***
76 * Current sequential number of OL (ordered list) html element. This information
77 * is used in the process of generation plain text message based on html code.
78 */
79 private int olNumber = 1;
80
81 /***
82 * Current html element is OL (ordered list), UN (unordered list) or something
83 * else. This information is used in the process of generation plain text message based
84 * on html code.
85 */
86 private String ul_ol = "";
87
88 /***
89 * Constant used in generating indent from left side. This information is used in
90 * the process of generation plain text message based on html code.
91 */
92 private final String indentString =
93 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t";
94
95 /***
96 * Container for storing pairs of replaced url or file addresses and
97 * corresponding generated Content-ID values.
98 */
99 private Vector sourceLinks = new Vector(0, 1);
100
101 /***
102 * Enable/disable swapping resource references in html code with generated
103 * value for Content-ID message bodypart header line. Default value is true
104 * (enable swapping)
105 */
106 private boolean enableSwapping = true;
107
108 /***
109 * Constructs HtmlAnalyzer from data given from InputStream. This constructor
110 * parses html code from input stream withouth swaping resources' locations from
111 * atribute's "src" and "background" value with generated Content-ID values. Also,
112 * it is performed generation of plain text message based on html code.
113 * @param content0 html code given as InputStream
114 * @SMIMEException caused by its private method analyze().
115 */
116 public HtmlAnalyzer(InputStream content0) throws SMIMEException {
117 Tidy tidy = new Tidy();
118
119 tidy.setWraplen(1000);
120 tidy.setShowWarnings(false);
121 tidy.setUpperCaseTags(true);
122 doc = (tidy.parseDOM(content0, null));
123 enableSwapping = false;
124 analyze(doc);
125 plainText = plainText + "\r\n";
126 }
127
128 /***
129 * Constructs HtmlAnalyzer from data given from InputStream. This constructor
130 * parses html code from input stream with swaping resources' locations from
131 * atribute's "src and "background" value with generated Content-ID values. In
132 * that process, it is used given second paremeter "path0" which represents
133 * common path to all resources in html code with relative path adresses. Also,
134 * it is performed generation of plain text message based on html code.
135 * @param content0 html code given as InputStream.
136 * @param path0 common path used for resolving all resources in html code with
137 * relative path adresses.
138 * @SMIMEException caused by its private method analyze().
139 */
140 public HtmlAnalyzer(InputStream content0, String path0) throws SMIMEException {
141 if (path0 != null) {
142 absolutPath = new String(path0);
143 if (absolutPath.charAt(absolutPath.length() - 1) == '//' ||
144 absolutPath.charAt(absolutPath.length() - 1) == '/')
145 absolutPath = absolutPath.substring(0, absolutPath.length() - 1);
146
147 absolutPath = absolutPath.replace('/', File.separatorChar);
148 absolutPath = absolutPath.replace('//', File.separatorChar) + File.separator;
149 }
150
151 Tidy tidy = new Tidy();
152
153 tidy.setWraplen(1000);
154 tidy.setShowWarnings(false);
155 tidy.setUpperCaseTags(true);
156 doc = (tidy.parseDOM(content0, null));
157 analyze(doc);
158 plainText = plainText + "\r\n";
159 }
160
161 /***
162 * Returns pairs of swapped resource URL adresses or File paths and appropriate
163 * generated Content IDs.
164 * @return Vector object whose even (and 0) indexes contain resource addresses
165 * as File or String objects, and whose odd indexes contain appropriate
166 * swapped Content-ID values.
167 */
168 public Vector getSwappedAdresses() {
169 return sourceLinks;
170 }
171
172 /***
173 * Returns plain/text representation of given html code document
174 * @return html document transformed to plain/text.
175 */
176 public String getPlainText() {
177 return plainText;
178 }
179
180 /***
181 * Returns html/text document passed throught JTidy html parser. All resource
182 * references which were accessible on the file system are swapped with
183 * generated content ID value. Also, all virtual references to appropriate
184 * InputStream resources (see setContent methods in classes from package
185 * org.webdocwf.util.smime.smime) are also swapped with generated Content-ID
186 * value.
187 * @return parsed html/text document.
188 * @exception SMIMEException caused by non SMIMEException which is:
189 * UnsupportedEncodingException.
190 */
191 public String getHtmlText() throws SMIMEException {
192 String returnString;
193
194 Tidy tidy = new Tidy();
195
196 tidy.setWraplen(1000);
197 ByteArrayOutputStream out = new ByteArrayOutputStream();
198
199 tidy.pprint(doc, out);
200
201 try {
202 returnString = out.toString("ISO-8859-1");
203 out.close();
204 } catch (Exception e) {
205 throw SMIMEException.getInstance(this, e, "getHtmlText");
206 }
207
208 return returnString;
209 }
210
211 /***
212 * Analyzes html code and creates alternate plain/text message from html code.
213 * Also, it creates Vector with corresponding pairs of resource locations discovered
214 * in html code (values of "background" and "src" attributes) and generated
215 * Content-ID values.
216 * @param node0 node element got from JTidy parser.
217 * @exception SMIMEException caused by MimeAssist.generateID() method or by
218 * its private method existenceOfResource().
219 */
220 private void analyze(Node node0) throws SMIMEException {
221
222 if (node0 == null) {
223 return;
224 }
225 String brLine = "\r\n";
226 int type = node0.getNodeType();
227
228 boolean pTagEnable_old = true;
229 int indent_old = 0;
230 int olNumber_old = 1;
231 String ul_ol_old = "";
232
233 switch (type) {
234 case Node.DOCUMENT_NODE: // Document node
235 analyze(((Document) node0).getDocumentElement());
236 break;
237
238 case Node.ELEMENT_NODE: // Element node
239 String elName = node0.getNodeName();
240
241 if (elName.equalsIgnoreCase("br")) {
242 plainText = plainText + brLine;
243 if (indent > 0)
244 plainText = plainText +
245 indentString.substring(0, indent - 1);
246 } else if (elName.equalsIgnoreCase("hr")) {
247 plainText = plainText + brLine +
248 "==================================================" +
249 brLine;
250 } else if (elName.equalsIgnoreCase("p")) {
251 if (pTagEnable) {
252 plainText = plainText + brLine + brLine;
253 if (indent > 0)
254 plainText = plainText +
255 indentString.substring(0, indent - 1);
256 }
257 pTagEnable = true;
258 } else if (elName.equalsIgnoreCase("ul")) {
259 pTagEnable_old = pTagEnable;
260 pTagEnable = false;
261 ul_ol_old = ul_ol;
262 ul_ol = elName;
263 indent_old = indent;
264 indent++;
265 } else if (elName.equalsIgnoreCase("ol")) {
266 pTagEnable_old = pTagEnable;
267 pTagEnable = false;
268 ul_ol_old = ul_ol;
269 ul_ol = elName;
270 indent_old = indent;
271 indent++;
272 olNumber_old = olNumber;
273 } else if (elName.equalsIgnoreCase("li")) {
274 pTagEnable = false;
275 if (ul_ol.equalsIgnoreCase("ul")) {
276 plainText = plainText + brLine +
277 indentString.substring(0, indent - 1) +
278 ">> ";
279 } else if (ul_ol.equalsIgnoreCase("ol")) {
280 plainText = plainText + brLine +
281 indentString.substring(0, indent - 1) +
282 olNumber + ". ";
283 olNumber++;
284 }
285 } else if (elName.equalsIgnoreCase("blockquote")) {
286 pTagEnable_old = pTagEnable;
287 pTagEnable = false;
288 indent_old = indent;
289 indent++;
290 plainText = plainText + brLine +
291 indentString.substring(0, indent);
292 } else if (elName.equalsIgnoreCase("q")) {
293 pTagEnable_old = pTagEnable;
294 pTagEnable = false;
295 plainText = plainText + "\"";
296 } else if (elName.equalsIgnoreCase("table")) {
297 plainText = plainText + brLine +
298 "**************************************************" + brLine +
299 "--------------------------------------------------" + brLine +
300 "-- -- -- -- -- -- -- -- -- -- -- -- --" + brLine;
301 } else if (elName.equalsIgnoreCase("tr")) {
302 plainText = plainText + brLine;
303 } else if (elName.equalsIgnoreCase("td")) {
304 plainText = plainText + brLine;
305 }
306 // attributes handling
307 NamedNodeMap attrs = node0.getAttributes();
308
309 for (int i = 0; i < attrs.getLength(); i++) {
310 attrs.item(i).getNodeName().toUpperCase();
311 if (enableSwapping &&
312 ((attrs.item(i).getNodeName()).equalsIgnoreCase("src") ||
313 (attrs.item(i).getNodeName()).equalsIgnoreCase("background"))) {
314 String resource = attrs.item(i).getNodeValue();
315 String cid = null;
316
317 //******nnn<virtual_file_name> <-- resources got from byte array input stream
318 if (resource.substring(0, 5).equalsIgnoreCase("*****")) {
319 for (int j = 0; j < sourceLinks.size() & cid == null; j = j + 2) {
320 if (sourceLinks.elementAt(j) instanceof String &&
321 ((String) sourceLinks.elementAt(j)).equals(resource))
322 cid = (String) sourceLinks.elementAt(j + 1);
323 }
324 if (cid == null) {
325 cid = MimeAssist.generateID();
326 sourceLinks.add(resource);
327 sourceLinks.add(cid);
328 }
329 attrs.item(i).setNodeValue("cid:" + cid);
330 } else {
331 File fRes = existenceOfResource(resource);
332
333 if (fRes != null) {
334 for (int j = 0; j < sourceLinks.size() & cid == null; j = j + 2) {
335 if (sourceLinks.elementAt(j) instanceof File &&
336 ((File) sourceLinks.elementAt(j)).compareTo(fRes) == 0)
337 cid = (String) sourceLinks.elementAt(j + 1);
338 }
339 if (cid == null) {
340 cid = MimeAssist.generateID();
341 sourceLinks.add(fRes);
342 sourceLinks.add(cid);
343 }
344 attrs.item(i).setNodeValue("cid:" + cid);
345 }
346 }
347 }
348 }
349 // finish of opening particular element tag
350 NodeList children = node0.getChildNodes(); //Passing through the node tree
351
352 if (children != null) {
353 int len = children.getLength();
354
355 for (int i = 0; i < len; i++) {
356 analyze(children.item(i));
357 }
358 }
359 // start of closing particular element tag
360 if (elName.equalsIgnoreCase("ul")) {
361 pTagEnable = pTagEnable_old;
362 ul_ol = ul_ol_old;
363 indent = indent_old;
364 } else if (elName.equalsIgnoreCase("ol")) {
365 pTagEnable = pTagEnable_old;
366 ul_ol = ul_ol_old;
367 indent = indent_old;
368 olNumber = olNumber_old;
369 } else if (elName.equalsIgnoreCase("table")) {
370 plainText = plainText + brLine +
371 "**************************************************";
372 } else if (elName.equalsIgnoreCase("tr")) {
373 plainText = plainText + brLine +
374 "--------------------------------------------------";
375 } else if (elName.equalsIgnoreCase("td")) {
376 plainText = plainText + brLine +
377 "-- -- -- -- -- -- -- -- -- -- -- -- --";
378 } else if (elName.equalsIgnoreCase("blockquote")) {
379 indent = indent_old;
380 pTagEnable = pTagEnable_old;
381 } else if (elName.equalsIgnoreCase("q")) {
382 plainText = plainText + "\"";
383 pTagEnable = pTagEnable_old;
384 }
385
386 break;
387
388 case Node.TEXT_NODE:
389 String nodeVal = node0.getNodeValue();
390
391 plainText = plainText + nodeVal;
392 break;
393 }
394
395 }
396
397 /**
398 * Method checks if it is given a resource reachable in the destination file system.
399 * @param resource0 can be absolute or relative path with specified file name
400 * or adress of file in URL form (example "file:///c:/temp/example.gif" )
401 * @return object of class File which represents existance of the resource file
402 * or null if resource does not exist on the destination in file system.
403 * @SMIMEException caused by non SMIMEException which is IOException.
404 */
405 private File existenceOfResource(String resource0) throws SMIMEException {
406
407 boolean resourceIsUrl = true;
408 String resource = new String(resource0);
409 URL url = null;
410
411 try {
412 url = new URL(resource0);
413 } catch (MalformedURLException e) {
414 resourceIsUrl = false;
415 }
416
417 if (resourceIsUrl == true && (!url.getProtocol().equalsIgnoreCase("file")))
418 return null;
419 else if (resourceIsUrl == true && url.getProtocol().equalsIgnoreCase("file")) {
420 resource = url.getFile();
421 }
422
423 resource = replaceHex(resource);
424 resource = resource.replace('/', File.separatorChar);
425 resource = resource.replace('//', File.separatorChar);
426 File fRes = new File(resource);
427
428 try {
429 if (fRes.exists())
430 return fRes.getAbsoluteFile().getCanonicalFile();
431
432 fRes = new File(absolutPath + resource);
433 if (fRes.exists())
434 return fRes.getAbsoluteFile().getCanonicalFile();
435
436 fRes = new File(absolutPath + resource);
437 if (fRes.exists())
438 return fRes.getAbsoluteFile().getCanonicalFile();
439 } catch (Exception e) {
440 throw SMIMEException.getInstance(this, e, "existenceOfResource");
441 }
442
443 return null;
444 }
445
446 /***
447 * Replaces possible hexadecimal representation of blank characters (presented
448 * with "%20") from resource String representation, with blank character.
449 * @param resources0 resource which is examined for hex representation of blank
450 * characters.
451 * @return String with replaced hexadecimal representation of blank characters.
452 */
453 private String replaceHex(String resources0) {
454 while (resources0.indexOf("%20") != -1) {
455 resources0 = resources0.substring(0, resources0.indexOf("%20")) + " " +
456 resources0.substring(resources0.indexOf("%20") + 3);
457 }
458 return resources0;
459 }
460
461 }
// This page was automatically generated by Maven