rita
Class RiHtmlParser

java.lang.Object
  extended by rita.RiHtmlParser
Direct Known Subclasses:
RiDataStore

public class RiHtmlParser
extends java.lang.Object

Provides various utility functions for fetching and parsing text data from web pages using either the Document-Object-Model (DOM) or regular expressions.

Parses an HTML document and returns the HTML text, with or without the HTML tags stripped. Can also be used for custom parsing, as in the fetchLinks() and fetchLinkText() methods (see the example below).

Simple Examples:

 RiHtmlParser rhp = new RiHtmlParser();
 System.out.println(rhp.fetch("http://www.google.com")); // simple fetch
 // -------------------------------------------------------------------
 System.out.println(rhp.fetch("http://www.google.com", true)); // fetch & strip
 // -------------------------------------------------------------------
 String[] links = rhp.fetchLinks("http://www.google.com"); // get links
 for (int i = 0; i < links.length; i++)
   // & print 'em
   System.out.println(i + ") " + links[i]); // one by one
 // -------------------------------------------------------------------
 System.out.println(rhp.parse("http://www.google.com")); // an empty parse
 

Also provides a base implementation so that subclasses can override handleText(), handleSimpleTag(), handleStartTag(), and handleEndTag() methods to define custom behavior (as below and in RiGoogleParser).

An example of a custom parse to retrieve all linked text:

      final List links = new ArrayList();
      rhp.customParse("http://www.google.com", 
        new HTMLEditorKit.ParserCallback() // an inner class
        {
          boolean isLink = false;
          public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
            if (t == Tag.A) isLink = true;
          }
          public void handleText(char[] data, int pos) {
            if (isLink) links.add(new String(data));        
          }
          public void handleEndTag(Tag t, int pos) {
            if (t == Tag.A) isLink = false;
          }
        }
      );
      
      // print out the link texts that we found 
      for (int i = 0; i < links.size(); i++) {
        System.out.println(i+") "+links.get(i));
      }
 


Field Summary
static java.lang.String DEFAULT_CHARSET
           
static int DEFAULT_CONNECT_TIMEOUT
           
static int DEFAULT_READ_TIMEOUT
           
static java.lang.String DEFAULT_USER_AGENT
           
 
Constructor Summary
RiHtmlParser()
          Deprecated.  
RiHtmlParser(processing.core.PApplet p)
           
 
Method Summary
 void customParse(java.lang.String url, javax.swing.text.html.HTMLEditorKit.ParserCallback parserCallback)
           
 java.lang.String fetch(java.lang.String url)
           
 java.lang.String fetch(java.lang.String url, boolean stripTags)
           
 java.lang.String fetch(java.lang.String url, boolean stripTags, int connectionTimeout)
           
 java.lang.String fetch(java.net.URL url)
           
 java.awt.Image fetchImage(java.lang.String url)
           
 java.lang.String[] fetchLinks(java.lang.String url)
           
 java.lang.String[] fetchLinkText(java.lang.String url)
           
 processing.core.PImage fetchPImage(processing.core.PApplet pApplet, java.lang.String url)
           
 int getConnectTimeout()
           
 int getReadTimeout()
           
 java.lang.String getUserAgent()
           
static void main(java.lang.String[] args)
           
 java.lang.String post(java.lang.String url, java.util.Map keyValuePairs)
           
 java.lang.String post(java.net.URL url, java.util.Map keyValuePairs)
           
 void setConnectTimeout(int connectTimeout)
           
 void setReadTimeout(int readTimeout)
           
 void setUserAgent(java.lang.String userAgent)
           
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

DEFAULT_USER_AGENT

public static final java.lang.String DEFAULT_USER_AGENT
See Also:
Constant Field Values

DEFAULT_CHARSET

public static java.lang.String DEFAULT_CHARSET

DEFAULT_CONNECT_TIMEOUT

public static int DEFAULT_CONNECT_TIMEOUT

DEFAULT_READ_TIMEOUT

public static int DEFAULT_READ_TIMEOUT
Constructor Detail

RiHtmlParser

public RiHtmlParser()
Deprecated. 

Invisible:

RiHtmlParser

public RiHtmlParser(processing.core.PApplet p)
Method Detail

fetch

public java.lang.String fetch(java.net.URL url)

fetchImage

public java.awt.Image fetchImage(java.lang.String url)

fetchPImage

public processing.core.PImage fetchPImage(processing.core.PApplet pApplet,
                                          java.lang.String url)

post

public java.lang.String post(java.lang.String url,
                             java.util.Map keyValuePairs)

post

public java.lang.String post(java.net.URL url,
                             java.util.Map keyValuePairs)

customParse

public void customParse(java.lang.String url,
                        javax.swing.text.html.HTMLEditorKit.ParserCallback parserCallback)

fetch

public java.lang.String fetch(java.lang.String url)

fetch

public java.lang.String fetch(java.lang.String url,
                              boolean stripTags)

fetch

public java.lang.String fetch(java.lang.String url,
                              boolean stripTags,
                              int connectionTimeout)

fetchLinkText

public java.lang.String[] fetchLinkText(java.lang.String url)

fetchLinks

public java.lang.String[] fetchLinks(java.lang.String url)

getUserAgent

public java.lang.String getUserAgent()

setUserAgent

public void setUserAgent(java.lang.String userAgent)

getReadTimeout

public int getReadTimeout()

setReadTimeout

public void setReadTimeout(int readTimeout)

getConnectTimeout

public int getConnectTimeout()

setConnectTimeout

public void setConnectTimeout(int connectTimeout)

main

public static void main(java.lang.String[] args)