`
gcgmh
  • 浏览: 349341 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

NekoHTML 用法

阅读更多
// Using NekoHTML together with XPath:
// fetch a page, parse it into a DOM with NekoHTML's DOMParser, then select
// nodes with an XPath expression and print the text of each match.
DOMParser parser = new DOMParser();
Document doc = null;
try {
    // Fallback encoding used when the page itself does not declare one.
    parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "gb2312");
    /* The Xerces HTML DOM implementation does not support namespaces
       and cannot represent XHTML documents with namespace information.
       Therefore, in order to use the default HTML DOM implementation with NekoHTML's
       DOMParser to parse XHTML documents, you must turn off namespace processing. */
    parser.setFeature("http://xml.org/sax/features/namespaces", false);

    String strURL = "http://product.dangdang.com/product.aspx?product_id=9317290";
    // Hand the parser the raw byte stream rather than a Reader: a character
    // stream is already decoded (with the platform default charset when none
    // is given), so the default-encoding property above would never be applied.
    InputSource source = new InputSource(new URL(strURL).openStream());
    try {
        parser.parse(source);
    } finally {
        // Release the connection even when parsing fails.
        source.getByteStream().close();
    }
    // Only publish the document once parsing has completed successfully.
    doc = parser.getDocument();
} catch (Exception e) {
    e.printStackTrace();
}

// tags should be in upper case
String productsXpath = "/HTML/BODY/DIV[2]/DIV[4]/DIV[2]/DIV/DIV[3]/UL[@class]/LI[9]";
NodeList products;
// Skip the query when the download or parse failed; there is no document to search.
if (doc != null) {
    try {
        products = XPathAPI.selectNodeList(doc, productsXpath);
        System.out.println("found: " + products.getLength());
        for (int i = 0; i < products.getLength(); i++) {
            Node node = products.item(i);
            System.out.println(i + ":\n" + node.getTextContent());
        }
    } catch (TransformerException e) {
        e.printStackTrace();
    }
}

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics