How to parse XML using Jericho HTML Parser

https://stackoverflow.com/questions/8214989

05-03-2021
|

Question

I'm new to Java and servlets and am currently trying to parse XML using the Jericho HTML Parser. For instance, I want to get the link from each link tag, but it does not show anything, and the total number says 27 (I only get the correct total count, without the link strings). If anyone knows how to do this, please teach me.

import java.io.IOException;
import java.io.PrintWriter;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.Source;

/**
 * Servlet mapped to {@code /HelloServlet} that loads the Yahoo News RSS feed
 * through the Jericho parser and writes an HTML page listing the content of
 * every element named {@code link}.
 */
@WebServlet(urlPatterns = {"/HelloServlet"})
public class HelloServlet extends HttpServlet {

    private static final long serialVersionUID = 1L;

    /**
     * Handles GET by emitting an HTML page with the number of {@code link}
     * elements found in the feed, followed by one paragraph per element.
     *
     * @param req  the incoming request (not read by this method)
     * @param resp the response the HTML page is written to
     * @throws ServletException declared by the servlet contract
     * @throws IOException if obtaining the response writer or loading the feed fails
     * @throws MalformedURLException if the feed URL cannot be constructed
     */
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp)
            throws ServletException, IOException, MalformedURLException {
        resp.setContentType("text/html; charset=UTF-8");
        final PrintWriter page = resp.getWriter();

        page.println("<html>");
        page.println("<head><meta http-equiv='content-type' content='text/html; charset=UTF-8'></head>");
        page.println("<body>");

        final Source feed = new Source(new URL("http://news.yahoo.com/rss/"));
        feed.fullSequentialParse();
        writeLinks(page, feed.getAllElements("link"));

        page.println("</body>");
        page.println("</html>");
    }

    /**
     * Writes the element count and then each element's content as its own paragraph.
     * Nothing is written when {@code linkElements} is {@code null}.
     */
    private static void writeLinks(PrintWriter page, List<Element> linkElements) {
        if (linkElements == null) {
            return;
        }
        page.println("<p>total：" + linkElements.size() + "</p>");
        for (int i = 0; i < linkElements.size(); i++) {
            // NOTE(review): Jericho parses the feed as HTML; if it treats <link> as an
            // empty (void) element, this content is an empty string — confirm against
            // the Jericho documentation.
            final String content = linkElements.get(i).getContent().toString();
            page.println("<p>" + content + "</p>");
        }
    }
}

Solution

According to the Jericho HTML Parser homepage, Jericho is for manipulating HTML documents. But the RSS feed from Yahoo is XML, so you can use Java's standard XML API (DOM) to parse the document and extract the link tags. Here is an example:

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

// ...

/**
 * Downloads the Yahoo News RSS feed and collects the text of each {@code <link>} element.
 *
 * @return the text content of every element whose tag name is {@code link}, in document
 *         order; empty when the feed contains none
 * @throws ParserConfigurationException if the XML parser cannot be created with the
 *         hardening features requested by this method
 * @throws SAXException if the feed cannot be parsed as XML
 * @throws IOException if the feed cannot be opened or read
 */
private List<String> getRssLinks() throws ParserConfigurationException,
    SAXException, IOException
{
  final URL url = new URL("http://news.yahoo.com/rss/");
  // try-with-resources guarantees the network stream is closed, even when parsing fails.
  try (InputStream feed = url.openStream()) {
    return parseRssLinks(feed);
  }
}

/**
 * Parses {@code xml} into a DOM document and returns the text of each {@code link} element.
 * The caller keeps ownership of the stream and is responsible for closing it.
 */
private List<String> parseRssLinks(final InputStream xml)
    throws ParserConfigurationException, SAXException, IOException
{
  final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
  // The feed is remote, untrusted XML: enable secure processing and stop the parser
  // from resolving external entities or fetching external DTDs (XXE hardening).
  factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
  factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
  factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
  factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

  final Document doc = factory.newDocumentBuilder().parse(xml);
  final NodeList linkNodes = doc.getElementsByTagName("link");
  final int count = linkNodes.getLength();

  final List<String> rssLinks = new ArrayList<>(count);
  for (int i = 0; i < count; i++) {
    // getTextContent() is declared on Node, so no cast to Element is needed.
    rssLinks.add(linkNodes.item(i).getTextContent());
  }
  return rssLinks;
}

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow