
Parsing the <A> tag

Posted on 2008/9/29 12:11:00

I've been reading a book about spiders lately, and it reminded me of the process of parsing the <A> tag. I'm copying the code over here for future reference:

    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.Date;

    import org.htmlparser.NodeFilter;
    import org.htmlparser.filters.AndFilter;
    import org.htmlparser.filters.HasAttributeFilter;
    import org.htmlparser.filters.NotFilter;
    import org.htmlparser.filters.TagNameFilter;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.util.NodeList;

    public class Extractxxx extends Extractor {
        // getParser(), getOutputPath(), getInputFilePath(), NEWLINE and traverse()
        // come from the Extractor base class (not shown here).

        public void extract() {
            BufferedWriter bw = null;

            // Match <a> tags, but skip the "parent directory" links whose href is "../".
            NodeFilter title_filter = new AndFilter(new TagNameFilter("a"),
                    new NotFilter(new HasAttributeFilter("href", "../")));

            // Extract the title information.
            try {
                // The Parser returns every node that satisfies the filter.
                NodeList title_nodes = this.getParser().parse(title_filter);
                for (int i = 0; i < title_nodes.size(); i++) {
                    LinkTag node = (LinkTag) title_nodes.elementAt(i);
                    // Split the link text on literal dots ("." is a regex metacharacter).
                    String[] names = node.getChildrenHTML().split("\\.");

                    // Output file name: the segments joined by "-" plus a timestamp.
                    StringBuffer title = new StringBuffer();
                    for (int j = 0; j < names.length; j++) {
                        title.append(names[j]).append("-");
                    }
                    title.append((new Date()).getTime());

                    bw = new BufferedWriter(new FileWriter(new File(
                            this.getOutputPath() + title + ".txt")));

                    // The href attribute of the current <a> tag.
                    String url = node.getAttribute("href");

                    // Rebuild the original URL from the local input file path.
                    int startPos = getInputFilePath().indexOf(":") + 2;
                    int endPos = getInputFilePath().lastIndexOf(".") - 5;
                    String url_seg = getInputFilePath().substring(startPos, endPos);
                    url_seg = url_seg.replaceAll("\\\\", "/");
                    String originalUrl = url_seg + url;
                    System.out.println(originalUrl);

                    bw.write(originalUrl + NEWLINE);
                    for (int k = 0; k < names.length; k++) {
                        bw.write(names[k] + NEWLINE);
                    }

                    try {
                        if (bw != null)
                            bw.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }

                /*
                NodeIterator it = title_nodes.elements();
                while (it.hasMoreNodes()) {
                    Node node = (Node) it.nextNode();
                    System.out.println(node.toHtml());
                }
                */
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        public static void main(String[] args) throws Exception {
            Extractor extractor = new Extractxxx();
            extractor.setOutputPath("c:\\film\\xxx\\");
            File path = new File("F:\\xxx.xxx.xxx\\xxxx");
            traverse(extractor, path);
        }
    }
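
The extract() method above depends on the Extractor base class (getParser(), getOutputPath(), traverse() and so on are not listed here), so here is a minimal, self-contained sketch of the same filter chain and LinkTag handling. It assumes HTMLParser 1.6; the class name and the inline HTML string are made up for illustration:

    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.filters.AndFilter;
    import org.htmlparser.filters.HasAttributeFilter;
    import org.htmlparser.filters.NotFilter;
    import org.htmlparser.filters.TagNameFilter;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.util.NodeList;

    public class LinkFilterDemo {
        public static void main(String[] args) throws Exception {
            // An in-memory page standing in for the file the spider downloaded.
            String html = "<a href=\"../\">parent</a>"
                        + "<a href=\"movie.2008.avi.html\">movie.2008.avi</a>";
            Parser parser = Parser.createParser(html, "UTF-8");

            // Same filter as in extract(): <a> tags whose href is not "../".
            NodeFilter filter = new AndFilter(new TagNameFilter("a"),
                    new NotFilter(new HasAttributeFilter("href", "../")));

            NodeList links = parser.parse(filter);
            for (int i = 0; i < links.size(); i++) {
                LinkTag link = (LinkTag) links.elementAt(i);
                System.out.println(link.getAttribute("href")); // movie.2008.avi.html
                System.out.println(link.getChildrenHTML());    // movie.2008.avi
            }
        }
    }

The NotFilter(new HasAttributeFilter("href", "../")) part is what keeps the directory-listing "parent" link out of the results.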

The key points to note:

1. The use of LinkTag;

2. The regular expression used in String[] names = node.getChildrenHTML().split("\\."); (see the snippet after this list);

3. String url = node.getAttribute("href"); this is how the value of the href attribute is obtained.
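
On point 2: the argument to String.split() is a regular expression, and "." is a metacharacter that matches any character, so it has to be escaped to split on a literal dot. A quick check, using the same made-up link text as above:

    String[] names = "movie.2008.avi".split("\\."); // ["movie", "2008", "avi"]
    String[] wrong = "movie.2008.avi".split(".");   // [] - every character is treated as a delimiter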

 

OK!

 
