Parsing DOM returned from JTidy to find a particular HTML element

https://stackoverflow.com//questions/9572891

07-12-2019
|

Question

I have been playing with this code for a while, and I am not certain what I am doing wrong.

I fetch a URL and clean the response up with JTidy, as it isn't well-formed; then I need to find a particular hidden input field (input type="hidden" name="mytarget" value="313"), so I know the value in the name attribute.

I have it printing out the entire html page when it cleans it up, just so I can compare what I am looking for with what is in the document.

My problem is determining the best way to find this field, at about the point where I currently have System.out << it.

    // Fetch `url` (defined outside this snippet) and run the response body through JTidy.
    def http = new HTTPBuilder( url )
    http.request(GET,TEXT) { req ->
        // Success handler: receives the response and a reader over its body.
        response.success = { resp, reader ->
            assert resp.status == 200
            def tidy = new Tidy()
            // First pass over `reader`; the tidied page is written to System.out.
            def node = tidy.parse(reader, System.out)
            // NOTE(review): this hands the same `reader` to a second parse call; the
            // first call has presumably already consumed it, so this DOM may be
            // empty — confirm. `doc` is not used anywhere below.
            def doc = tidy.parseDOM(reader, null).documentElement
            // NOTE(review): navigates via `last` twice on the result of the first
            // parse — presumably the last child of the last child; verify against
            // the JTidy node API.
            def nodes = node.last.last
            // Dumps each item to standard output; this is where the search for the
            // hidden input field is meant to go.
            nodes.each{System.out << it}
        }
        // Failure handler: print the HTTP status line.
        response.failure = { resp -> println resp.statusLine }
    }

Solution

Have you tried taking a look at JSoup instead of JTidy? I'm not sure how well it handles malformed HTML content, but I've used it successfully in parsing an HTML page and finding the element I needed using jQuery-style selectors. This is much easier than traversing the DOM manually unless you know the exact layout of the DOM.

@Grab(group='org.codehaus.groovy.modules.http-builder', module='http-builder', version='0.5.2')
@Grab(group='org.jsoup', module='jsoup', version='1.6.1')

import groovyx.net.http.HTTPBuilder
import static groovyx.net.http.Method.GET
import static groovyx.net.http.ContentType.TEXT
import org.jsoup.Jsoup

// Page to download and scan for hidden <input> fields.
def url = 'http://stackoverflow.com/questions/9572891/parsing-dom-returned-from-jtidy-to-find-a-particular-html-element'

def client = new HTTPBuilder(url)
client.request(GET, TEXT) { req ->
    // Success handler: parse the body with jsoup and print every hidden
    // input as "name=value", one per line.
    response.success = { resp, reader ->
        assert resp.status == 200
        def page = Jsoup.parse(reader.text)
        // CSS selector: <input> elements whose type attribute is "hidden".
        def hiddenInputs = page.select('input[type=hidden]')
        for (field in hiddenInputs) {
            def fieldName = field.attr('name')
            def fieldValue = field.attr('value')
            println fieldName + '=' + fieldValue
        }
    }
    // Failure handler: print the HTTP status line.
    response.failure = { resp -> println resp.statusLine }
}

OTHER TIPS

You could also use nekohtml:

@Grab(group='org.codehaus.groovy.modules.http-builder', module='http-builder', version='0.5.2')
@Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.15')

import groovyx.net.http.HTTPBuilder
import static groovyx.net.http.Method.GET
import static groovyx.net.http.ContentType.TEXT
import org.cyberneko.html.parsers.SAXParser

// Page to download and scan for hidden <input> fields.
def url = 'http://stackoverflow.com/questions/9572891/parsing-dom-returned-from-jtidy-to-find-a-particular-html-element'

def client = new HTTPBuilder(url)
client.request(GET, TEXT) { req ->
    // Success handler: parse the body through XmlSlurper backed by the
    // NekoHTML SAX parser, then print every hidden input as "name=value".
    response.success = { resp, reader ->
        assert resp.status == 200
        def htmlParser = new SAXParser()
        def root = new XmlSlurper( htmlParser ).parseText( reader.text )
        // Walk the whole tree and keep only INPUT elements with type="hidden".
        // NOTE(review): the upper-case 'INPUT' presumably relies on NekoHTML
        // reporting element names in upper case by default — confirm if the
        // parser configuration changes.
        def hiddenInputs = root.depthFirst().grep { node ->
            node.name() == 'INPUT' && node.@type?.toString() == 'hidden'
        }
        for (field in hiddenInputs) {
            println "${field.@name}=${field.@value}"
        }
    }
    // Failure handler: print the HTTP status line.
    response.failure = { resp -> println resp.statusLine }
}

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow