// This file defines a class that offers a simple interface to Web pages for // Java programmers. // History: // // December 2002 -- Made more accepting of what content types it will // read, in particular should now accept any sort of text. // // October 2002 -- This version created by Doug Baldwin, from an older // version that had a tendency to not actually read whole pages, as // well as some features specific to course exercises done with the // class. This version hopefully fixes the bug, and makes the class // more general. Also adds javadoc documentation. // // Oct. 2001 -- Original version created by Doug Baldwin. package geneseo.cs.sc; import java.net.*; import java.io.*; /** * Represents a Web page, i.e., a long string of text that (presumably) * lives on a Web server somewhere and is identified by a URL. This class * provides a much simpler, but less functional, interface to Web pages * than the standard Java library classes do. */ public class WebPage { // Web page objects record the Java URL object for their page, and the contents // of that page. private URL myURL; private String content; /** * Initialize a Web page. For example *
* <pre>WebPage page = new WebPage( "http://www.somewhere.com" );</pre>
*
* @param url The URL for the page this object should represent.
*/
// This works by creating a Java URL object for this page, and then reading
// its content. If the content is text, then it is saved in a member variable
// for later. I also save the URL object for this page, if it could be created
// at all. I catch exceptions that might be thrown by creating or reading the
// page in this method, printing an error message, so that clients know something
// has gone wrong, but needn't (and can't) deal with exceptions themselves.
public WebPage( String url ) {
    // Constructor contract: never throws for a bad or unreadable URL. Any
    // failure is reported on System.err and leaves "content" null, so that
    // clients can detect the problem without handling exceptions.
    try {
        myURL = new URL( url );
        content = null;            // Will stay null if the read fails or is skipped
        String protocol = myURL.getProtocol();
        // Only process URLs whose protocol implies that they might correspond to
        // text Web pages. "https" is accepted alongside "http" so that pages
        // served over TLS can be read as well.
        if ( protocol.equalsIgnoreCase("http")
                || protocol.equalsIgnoreCase("https")
                || protocol.equalsIgnoreCase("file") ) {
            URLConnection connection = myURL.openConnection();
            String contentType = connection.getContentType();
            // Media type names are case-insensitive, so "Text/HTML" must be
            // accepted just like "text/html".
            if ( contentType != null
                    && contentType.regionMatches( true, 0, "text/", 0, 5 ) ) {
                InputStream pageStream = connection.getInputStream();
                try {
                    // Decode with the charset the server declared, if any;
                    // otherwise (or if this JVM doesn't know that charset)
                    // fall back to the platform default, as before.
                    Reader pageReader;
                    String charsetName = declaredCharset( contentType );
                    if ( charsetName == null ) {
                        pageReader = new InputStreamReader( pageStream );
                    }
                    else {
                        try {
                            pageReader = new InputStreamReader( pageStream, charsetName );
                        }
                        catch ( UnsupportedEncodingException unsupported ) {
                            pageReader = new InputStreamReader( pageStream );
                        }
                    }
                    // Read the page in chunks, appending each chunk to a string
                    // buffer. At the end, save the buffer as a string in the
                    // "content" member variable.
                    StringBuffer contentBuffer = new StringBuffer();
                    char[] chunk = new char[ 4096 ];
                    int count = pageReader.read( chunk );
                    while ( count != -1 ) {
                        contentBuffer.append( chunk, 0, count );
                        count = pageReader.read( chunk );
                    }
                    content = contentBuffer.toString();
                }
                finally {
                    // Release the stream even when reading fails part-way
                    // through; closing it also invalidates the reader on top.
                    pageStream.close();
                }
            }
        }
    }
    catch ( MalformedURLException error ) {
        System.err.println( "'" + url + "' isn't a valid Web page: " + error );
        myURL = null;
        content = null;
    }
    catch ( IOException error ) {
        System.err.println( "Couldn't read Web page '" + url + "': " + error );
        content = null;
    }
}

// Extract the value of the "charset" parameter from a Content-Type string such
// as "text/html; charset=UTF-8". Returns null when no (non-empty) charset is
// declared. The parameter name is matched without regard to case, and a value
// written as a quoted string has its quotes removed.
private static String declaredCharset( String contentType ) {
    final String key = "charset=";
    for ( int i = 0; i + key.length() <= contentType.length(); i++ ) {
        if ( contentType.regionMatches( true, i, key, 0, key.length() ) ) {
            String name = contentType.substring( i + key.length() );
            int end = name.indexOf( ';' );      // Another parameter may follow
            if ( end >= 0 ) {
                name = name.substring( 0, end );
            }
            name = name.trim();
            if ( name.length() >= 2
                    && name.startsWith( "\"" ) && name.endsWith( "\"" ) ) {
                name = name.substring( 1, name.length() - 1 ).trim();
            }
            return name.length() > 0 ? name : null;
        }
    }
    return null;
}
/**
* Return the text stored in a Web page, as a long string. For example
*
* <pre>String pageText = somePage.getText();</pre>
*
* @return The contents of the page, or <code>null</code> if the
* page couldn't be fetched from its server for any reason or its
* content is not text.
*/
// This simply returns the "content" member variable, which is either the
// text of the page or null if anything went wrong in initialization.
public String getText() {
    // Nothing is computed here: the constructor either stored the page's
    // text in "content" or left it null, and that value is handed back as is.
    final String pageText = this.content;
    return pageText;
}
}