Untitled

 avatar
unknown
plain_text
2 years ago
7.1 kB
5
Indexable
 package org.jsoup.examples;

 2 	  	

 

 3 	  	

 import org.jsoup.Jsoup;

 4 	  	

 import org.jsoup.helper.StringUtil;

 5 	  	

 import org.jsoup.helper.Validate;

 6 	  	

 import org.jsoup.nodes.Document;

 7 	  	

 import org.jsoup.nodes.Element;

 8 	  	

 import org.jsoup.nodes.Node;

 9 	  	

 import org.jsoup.nodes.TextNode;

 10 	  	

 import org.jsoup.select.Elements;

 11 	  	

 import org.jsoup.select.NodeTraversor;

 12 	  	

 import org.jsoup.select.NodeVisitor;

 13 	  	

 

 14 	  	

 import java.io.IOException;

 15 	  	

 

 16 	  	

 /**

 17 	  	

  * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted

 18 	  	

  * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a

 19 	  	

  * scrape.

 20 	  	

  * <p>

 21 	  	

  * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.

 22 	  	

  * </p>

 23 	  	

  * <p>

 24 	  	

  * To invoke from the command line, assuming you've downloaded the jsoup jar to your current directory:</p>

 25 	  	

  * <p><code>java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]</code></p>

 26 	  	

  * where <i>url</i> is the URL to fetch, and <i>selector</i> is an optional CSS selector.

 27 	  	

  * 

 28 	  	

  * @author Jonathan Hedley, jonathan@hedley.net

 29 	  	

  */

 30 	 0 	

 public class HtmlToPlainText {

 31 	  	

     private static final String userAgent = "Mozilla/5.0 (jsoup)";

 32 	  	

     private static final int timeout = 5 * 1000;

 33 	  	

 

 34 	  	

     public static void main(String... args) throws IOException {

 35 	 0 	

         Validate.isTrue(args.length == 1 || args.length == 2, "usage: java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]");

 36 	 0 	

         final String url = args[0];

 37 	 0 	

         final String selector = args.length == 2 ? args[1] : null;

 38 	  	

 

 39 	  	

         // fetch the specified URL and parse to a HTML DOM

 40 	 0 	

         Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get();

 41 	  	

 

 42 	 0 	

         HtmlToPlainText formatter = new HtmlToPlainText();

 43 	  	

 

 44 	 0 	

         if (selector != null) {

 45 	 0 	

             Elements elements = doc.select(selector); // get each element that matches the CSS selector

 46 	 0 	

             for (Element element : elements) {

 47 	 0 	

                 String plainText = formatter.getPlainText(element); // format that element to plain text

 48 	 0 	

                 System.out.println(plainText);

 49 	 0 	

             }

 50 	 0 	

         } else { // format the whole doc

 51 	 0 	

             String plainText = formatter.getPlainText(doc);

 52 	 0 	

             System.out.println(plainText);

 53 	  	

         }

 54 	 0 	

     }

 55 	  	

 

 56 	  	

     /**

 57 	  	

      * Format an Element to plain-text

 58 	  	

      * @param element the root element to format

 59 	  	

      * @return formatted text

 60 	  	

      */

 61 	  	

     public String getPlainText(Element element) {

 62 	 0 	

         FormattingVisitor formatter = new FormattingVisitor();

 63 	 0 	

         NodeTraversor.traverse(formatter, element); // walk the DOM, and call .head() and .tail() for each node

 64 	  	

 

 65 	 0 	

         return formatter.toString();

 66 	  	

     }

 67 	  	

 

 68 	  	

     // the formatting rules, implemented in a breadth-first DOM traverse

 69 	 0 	

     private class FormattingVisitor implements NodeVisitor {

 70 	  	

         private static final int maxWidth = 80;

 71 	 0 	

         private int width = 0;

 72 	 0 	

         private StringBuilder accum = new StringBuilder(); // holds the accumulated text

 73 	  	

 

 74 	  	

         // hit when the node is first seen

 75 	  	

         public void head(Node node, int depth) {

 76 	 0 	

             String name = node.nodeName();

 77 	 0 	

             if (node instanceof TextNode)

 78 	 0 	

                 append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.

 79 	 0 	

             else if (name.equals("li"))

 80 	 0 	

                 append("\n * ");

 81 	 0 	

             else if (name.equals("dt"))

 82 	 0 	

                 append("  ");

 83 	 0 	

             else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr"))

 84 	 0 	

                 append("\n");

 85 	 0 	

         }

 86 	  	

 

 87 	  	

         // hit when all of the node's children (if any) have been visited

 88 	  	

         public void tail(Node node, int depth) {

 89 	 0 	

             String name = node.nodeName();

 90 	 0 	

             if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5"))

 91 	 0 	

                 append("\n");

 92 	 0 	

             else if (name.equals("a"))

 93 	 0 	

                 append(String.format(" <%s>", node.absUrl("href")));

 94 	 0 	

         }

 95 	  	

 

 96 	  	

         // appends text to the string builder with a simple word wrap method

 97 	  	

         private void append(String text) {

 98 	 0 	

             if (text.startsWith("\n"))

 99 	 0 	

                 width = 0; // reset counter if starts with a newline. only from formats above, not in natural text

 100 	 0 	

             if (text.equals(" ") &&

 101 	 0 	

                     (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))

 102 	 0 	

                 return; // don't accumulate long runs of empty spaces

 103 	  	

 

 104 	 0 	

             if (text.length() + width > maxWidth) { // won't fit, needs to wrap

 105 	 0 	

                 String words[] = text.split("\\s+");

 106 	 0 	

                 for (int i = 0; i < words.length; i++) {

 107 	 0 	

                     String word = words[i];

 108 	 0 	

                     boolean last = i == words.length - 1;

 109 	 0 	

                     if (!last) // insert a space if not the last word

 110 	 0 	

                         word = word + " ";

 111 	 0 	

                     if (word.length() + width > maxWidth) { // wrap and reset counter

 112 	 0 	

                         accum.append("\n").append(word);

 113 	 0 	

                         width = word.length();

 114 	  	

                     } else {

 115 	 0 	

                         accum.append(word);

 116 	 0 	

                         width += word.length();

 117 	  	

                     }

 118 	  	

                 }

 119 	 0 	

             } else { // fits as is, without need to wrap text

 120 	 0 	

                 accum.append(text);

 121 	 0 	

                 width += text.length();

 122 	  	

             }

 123 	 0 	

         }

 124 	  	

 

 125 	  	

         @Override

 126 	  	

         public String toString() {

 127 	 0 	

             return accum.toString();

 128 	  	

         }

 129 	  	

     }

 130 	  	

 }
Editor is loading...