CSDN博客

img redvalley

写个程序自动下载留言,没辙了:(

发表于2004/12/30 0:39:00  915人阅读

刚写了个HTML文件解析器,生成简单的Element树,包括所有元素。

虽然已有很多解析HTML的API,但我只需要用到其中一点点功能,自己写一个也很快。

类:HTMLElement、HTMLParser、LoginAgent

目的是为了用程序去自动下载留言页面,使用HttpURLConnection post登录,搜索出loginForm的所有input,只需特别设置用户密码即可。

但是,页面打开后,里面的链接居然还要post数据;问题是这些input的值是由页面上的javascript在浏览器里设置的,所以通过InputStream直接下载得到的HTML中它们没有值,晕倒!!没辙了。

package com.forum;
import java.io.*;
/**
 *
 * <p>Title: </p>
 * <p>Description: </p>
 * <p>Copyright: Copyright (c) 2004</p>
 * <p>Company: </p>
 * @author zfzheng 猪神
 * @version 1.0
 */
public class HTMLParser {

    public static HTMLElement buildHTMLElementFromFile(String fileName)throws IOException{
        BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(fileName),"GB2312"));
        String line;
        StringBuffer result=new StringBuffer();
        while((line=br.readLine())!=null){
            result.append(line);
        }
        br.close();
        return buildHTMLElementFromString(result.toString());
    }


    public static HTMLElement buildHTMLElementFromString(String data){
        HTMLElement root=new HTMLElement();
        HTMLElement parent=root;
        root.setPrototypeString("根节点");
        HTMLElement e=null;
        char[] chs=data.toCharArray();
        char ch;
        int i=0;
        StringBuffer tagName=new StringBuffer();
        boolean inCommont=false;
        boolean tagDefineBegin=false;
        boolean tagDefineEnd=false;

        while(i<chs.length){
            ch=chs[i];
            if(inCommont){
                tagName.append(ch);
                if(chs[i]=='>' && i-2>=0 && chs[i-1]=='-' && chs[i-2]=='-'){//注释结束
                    inCommont=false;
                }
                continue;
            }

            switch(ch){
                case '<':{
                    if(i+1<chs.length && chs[i+1]!='/'){
                        tagDefineBegin = true;
                        tagDefineEnd=false;
                        e = new HTMLElement();
                    }else if(i+3<chs.length&& chs[i+1]=='!' &&chs[i+2]=='-' &&chs[i+3]=='-'){//注释
                        tagName.append(ch);
                        inCommont=true;
                    }else{//标记结束,等下一个循环
                    }
                    break;
                }
                case '>':{
                    if(tagDefineEnd){//结束
                        if (HTMLElement.isContentTag(tagName.toString())) { //可包含的标记结束,父节点返回上一层
                            if (parent.getParent() != null) { //返回上一层父节点
                                parent = parent.getParent();
                            }
                        }
                    }else{
                        tagDefineBegin = false;
                        e.setPrototypeString(tagName.toString());
                        parent.addChildren(e);
                        e.setParent(parent);
                        if (HTMLElement.isContentTag(e.getTagName())) { //可包含的标记
                            parent = e;
                        }else{
                            tagDefineEnd=true;
                        }
                    }
                    tagName.setLength(0);
                    break;
                }
                case '/':{
                    if(i>0 && chs[i-1]=='<'){
                        tagDefineEnd=true;
                        tagName.setLength(0);
                    }
                    break;
                }
                default:{
                    if(tagDefineBegin){
                        tagName.append(ch);
                    }else if(tagDefineEnd){
                        tagName.append(ch);
                    }
                    break;
                }

            }
            i++;
        }
        return root;
    }

public static void main(String[] args) {
    HTMLParser.buildHTMLElementFromString("<html><head><title> New Document </title><meta name='Generator' content='EditPlus'><meta ame='Author' content=''><meta name='Keywords' content=''><meta name='Description' content=''></head><body></body></html>").dump();
    try {
        HTMLParser.buildHTMLElementFromFile("d:/bb.htm").dumpHTML();
    } catch (IOException ex) {
        ex.printStackTrace();
    }

}
}

0 0

相关博文

我的热门文章

img
取 消
img即使是一小步
也想与你分享
打开
img