rita
Class RiHtmlParser
java.lang.Object
rita.RiHtmlParser
- Direct Known Subclasses:
- RiDataStore
public class RiHtmlParser
- extends java.lang.Object
Provides various utility functions for fetching and parsing text data from
web pages using either the Document-Object-Model (DOM) or regular
expressions.
Parses an HTML document and returns the HTML text, with or without the HTML
tags stripped. Can also be used for custom parsing, as in the fetchLinks()
and fetchLinkText() methods (see the example below).
Simple Examples:
RiHtmlParser rhp = new RiHtmlParser();
System.out.println(rhp.fetch("http://www.google.com")); // simple fetch
// -------------------------------------------------------------------
System.out.println(rhp.fetch("http://www.google.com", true)); // fetch & strip
// -------------------------------------------------------------------
String[] links = rhp.fetchLinks("http://www.google.com"); // get links
for (int i = 0; i < links.length; i++)
// & print 'em
System.out.println(i + ") " + links[i]); // one by one
// -------------------------------------------------------------------
System.out.println(rhp.parse("http://www.google.com")); // an empty parse
Also provides a base implementation so that subclasses can override the
handleText(), handleSimpleTag(), handleStartTag(), and handleEndTag()
methods to define custom behavior (as shown below and in RiGoogleParser).
An example of a custom parse to retrieve all linked text:
final List links = new ArrayList();
rhp.customParse("http://www.google.com",
new HTMLEditorKit.ParserCallback() // an inner class
{
boolean isLink = false;
public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
if (t == Tag.A) isLink = true;
}
public void handleText(char[] data, int pos) {
if (isLink) links.add(new String(data));
}
public void handleEndTag(Tag t, int pos) {
if (t == Tag.A) isLink = false;
}
}
);
// print out the link texts that we found
for (int i = 0; i < links.size(); i++) {
System.out.println(i+") "+links.get(i));
}
|
Method Summary |
void |
customParse(java.lang.String url,
javax.swing.text.html.HTMLEditorKit.ParserCallback parserCallback)
|
java.lang.String |
fetch(java.lang.String url)
|
java.lang.String |
fetch(java.lang.String url,
boolean stripTags)
|
java.lang.String |
fetch(java.lang.String url,
boolean stripTags,
int connectionTimeout)
|
java.lang.String |
fetch(java.net.URL url)
|
java.awt.Image |
fetchImage(java.lang.String url)
|
java.lang.String[] |
fetchLinks(java.lang.String url)
|
java.lang.String[] |
fetchLinkText(java.lang.String url)
|
processing.core.PImage |
fetchPImage(processing.core.PApplet pApplet,
java.lang.String url)
|
int |
getConnectTimeout()
|
int |
getReadTimeout()
|
java.lang.String |
getUserAgent()
|
static void |
main(java.lang.String[] args)
|
java.lang.String |
post(java.lang.String url,
java.util.Map keyValuePairs)
|
java.lang.String |
post(java.net.URL url,
java.util.Map keyValuePairs)
|
void |
setConnectTimeout(int connectTimeout)
|
void |
setReadTimeout(int readTimeout)
|
void |
setUserAgent(java.lang.String userAgent)
|
| Methods inherited from class java.lang.Object |
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
DEFAULT_USER_AGENT
public static final java.lang.String DEFAULT_USER_AGENT
- See Also:
- Constant Field Values
DEFAULT_CHARSET
public static java.lang.String DEFAULT_CHARSET
DEFAULT_CONNECT_TIMEOUT
public static int DEFAULT_CONNECT_TIMEOUT
DEFAULT_READ_TIMEOUT
public static int DEFAULT_READ_TIMEOUT
RiHtmlParser
public RiHtmlParser()
- Deprecated.
- Invisible:
RiHtmlParser
public RiHtmlParser(processing.core.PApplet p)
fetch
public java.lang.String fetch(java.net.URL url)
fetchImage
public java.awt.Image fetchImage(java.lang.String url)
fetchPImage
public processing.core.PImage fetchPImage(processing.core.PApplet pApplet,
java.lang.String url)
post
public java.lang.String post(java.lang.String url,
java.util.Map keyValuePairs)
post
public java.lang.String post(java.net.URL url,
java.util.Map keyValuePairs)
customParse
public void customParse(java.lang.String url,
javax.swing.text.html.HTMLEditorKit.ParserCallback parserCallback)
fetch
public java.lang.String fetch(java.lang.String url)
fetch
public java.lang.String fetch(java.lang.String url,
boolean stripTags)
fetch
public java.lang.String fetch(java.lang.String url,
boolean stripTags,
int connectionTimeout)
fetchLinkText
public java.lang.String[] fetchLinkText(java.lang.String url)
fetchLinks
public java.lang.String[] fetchLinks(java.lang.String url)
getUserAgent
public java.lang.String getUserAgent()
setUserAgent
public void setUserAgent(java.lang.String userAgent)
getReadTimeout
public int getReadTimeout()
setReadTimeout
public void setReadTimeout(int readTimeout)
getConnectTimeout
public int getConnectTimeout()
setConnectTimeout
public void setConnectTimeout(int connectTimeout)
main
public static void main(java.lang.String[] args)