Go: How would you "Pretty Print"/"Prettify" HTML?

Question 1

I faced the same problem and solved it by writing my own HTML formatting package in Go.

Here it is:

Please check this package out.

Thanks,

Keiji

Question 2

I found this question when trying to figure out how to pretty print xml in Go. Since I didn't find the answer anywhere, here's my solution:

import (
    "bytes"
    "encoding/xml"
    "io"
)

// formatXML returns the XML document in data re-encoded with two-space
// indentation.
//
// Tokens are streamed one at a time from a decoder over data into an
// indenting encoder, so the document is never unmarshalled into Go values.
// Any decoding, encoding, or flushing error is returned and the result is nil.
func formatXML(data []byte) ([]byte, error) {
    var out bytes.Buffer
    decoder := xml.NewDecoder(bytes.NewReader(data))
    encoder := xml.NewEncoder(&out)
    encoder.Indent("", "  ")
    for {
        token, err := decoder.Token()
        if err == io.EOF {
            // End of input: every token has been handed to the encoder.
            break
        }
        if err != nil {
            return nil, err
        }
        if err := encoder.EncodeToken(token); err != nil {
            return nil, err
        }
    }
    // The encoder buffers its output; a failed flush means out is incomplete,
    // so report it rather than returning truncated bytes.
    if err := encoder.Flush(); err != nil {
        return nil, err
    }
    return out.Bytes(), nil
}

Question 3

EDIT: Found a great way using the XML parser:

package main

import (
    "encoding/xml"
    "fmt"
)

// main parses a hard-coded HTML string as XML into a generic node tree and
// prints the tree re-marshalled with tab indentation. If parsing or
// marshalling fails, the error is printed instead of the document.
func main() {
    html := "<html><head><title>Website Title</title></head><body><div class=\"random-class\"><h1>I like pie</h1><p>It's true!</p></div></body></html>"
    // node is a catch-all element: XMLName holds the tag name, Children any
    // nested elements, and Text the character data.
    type node struct {
        // Attr carries no `xml:",any,attr"` tag, so element attributes are
        // not captured and do not appear in the output.
        Attr     []xml.Attr
        XMLName  xml.Name
        Children []node `xml:",any"`
        Text     string `xml:",chardata"`
    }
    x := node{}
    // encoding/xml is a strict XML parser; HTML that is not well-formed XML
    // is rejected here, so the error must not be ignored.
    if err := xml.Unmarshal([]byte(html), &x); err != nil {
        fmt.Println("unmarshal:", err)
        return
    }
    buf, err := xml.MarshalIndent(x, "", "\t")
    if err != nil {
        fmt.Println("marshal:", err)
        return
    }
    fmt.Println(string(buf))
}

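will output the following (note that the class attribute on the div is dropped, because the node struct does not capture attributes):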

<html>
    <head>
        <title>Website Title</title>
    </head>
    <body>
        <div>
            <h1>I like pie</h1>
            <p>It&#39;s true!</p>
        </div>
    </body>
</html>

Question 4

Short answer

Use this HTML prettyprint library for Go (that I wrote, *uhum*). It has some tests and works for basic inputs, and will hopefully become more robust over time, though it isn't very robust right now. Note the Known Issues section in the readme.

Long Answer

Rolling your own HTML prettifier for simple cases is reasonably easy using the code.google.com/p/go.net/html package, which has since moved to golang.org/x/net/html (that's what the above package does). Here is a very simple Prettify function implemented in this way:

// Prettify re-indents the HTML in raw, placing each tag on its own line and
// prefixing it with one copy of indent per level of open start tags.
//
// Text tokens are kept on the same line as the tag that precedes them, and a
// tag that directly follows a text token stays on that line too. Text tokens
// consisting only of newlines are dropped. Leading and trailing newlines are
// trimmed from the result.
//
// The returned error is always nil; tokenizer errors are not inspected.
func Prettify(raw string, indent string) (pretty string, e error) {
    z := html.NewTokenizer(strings.NewReader(raw))
    // A builder avoids re-copying the whole output on every appended token.
    var out strings.Builder
    depth := 0
    // Seeded with a non-text token type so the first token starts a new line.
    prevToken := html.CommentToken
    for {
        tt := z.Next()
        tokenString := string(z.Raw())

        // End of input with nothing left to emit: stop before writing a
        // newline and indentation that would dangle at the end of the output
        // whenever depth is still positive (unclosed or void tags).
        if tt == html.ErrorToken && len(tokenString) == 0 {
            break
        }

        // strip away newlines
        if tt == html.TextToken && len(strings.Trim(tokenString, "\n")) == 0 {
            continue
        }

        // An end tag is printed at its parent's level, so dedent first.
        if tt == html.EndTagToken {
            depth--
        }

        if tt != html.TextToken && prevToken != html.TextToken {
            out.WriteByte('\n')
            for i := 0; i < depth; i++ {
                out.WriteString(indent)
            }
        }

        out.WriteString(tokenString)

        // last token
        if tt == html.ErrorToken {
            break
        }
        // A start tag is printed at the current level; its contents go deeper.
        if tt == html.StartTagToken {
            depth++
        }
        prevToken = tt
    }
    return strings.Trim(out.String(), "\n"), nil
}

It handles simple examples, like the one you provided. For example,

// Sample input: a small document whose tags are unevenly split across lines.
// NOTE(review): the variable name html would shadow the html package if this
// snippet shares a scope with that import — confirm before pasting.
html := `<!DOCTYPE html><html><head>
<title>Website Title</title>
</head><body>
<div class="random-class">
<h1>I like pie</h1><p>It's true!</p></div>
</body></html>`
// Indent with four spaces per level. The error result is discarded; the
// Prettify shown above only ever returns nil for it.
pretty, _ := Prettify(html, "    ")
fmt.Println(pretty)

will print the following:

<!DOCTYPE html>
<html>
    <head>
        <title>Website Title</title>
    </head>
    <body>
        <div class="random-class">
            <h1>I like pie</h1>
            <p>It's true!</p>
        </div>
    </body>
</html>

Beware, though: this simple approach doesn't yet handle HTML comments, nor does it handle perfectly valid HTML5 void elements written without a closing slash (and therefore not XHTML-compliant), like <br>. Whitespace is not guaranteed to be preserved where it should be, and there is a whole range of other edge cases I haven't yet thought of. Use it only as a reference, a toy, or a starting point :)

Question 5

You could parse the HTML with code.google.com/p/go.net/html (now golang.org/x/net/html), and write your own version of the Render function from that package—one that keeps track of indentation.

But let me warn you: you need to be careful with adding and removing whitespace in HTML. Although whitespace is not usually significant, you can have spaces appearing and disappearing in the rendered text if you're not careful.

Edit:

Here's a pretty-printer function I wrote recently. It handles some of the special cases, but not all of them.

// prettyPrint writes an indented rendering of the node tree rooted at n to b.
// depth is the nesting level passed to indent for each emitted line.
//
// Elements are rendered on a single line with html.Render when they have no
// children, are pre or textarea, contain exactly one text child, or are
// inline with only inline content. Every other element gets its start and end
// tags on separate lines with its children printed one level deeper. Text
// children of script and style elements are re-indented line by line with
// prettyPrintScript.
//
// Side effect: the Data of text nodes is trimmed in place, so the tree is
// modified. Errors returned by html.Render are not checked.
func prettyPrint(b *bytes.Buffer, n *html.Node, depth int) {
    switch n.Type {
    case html.DocumentNode:
        // The document node emits nothing itself; children keep the same depth.
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            prettyPrint(b, c, depth)
        }

    case html.ElementNode:
        justRender := false
        switch {
        case n.FirstChild == nil:
            justRender = true
        case n.Data == "pre" || n.Data == "textarea":
            // Rendered verbatim rather than re-indented.
            justRender = true
        case n.Data == "script" || n.Data == "style":
            // Never rendered on one line; handled by the child loop below.
        case n.FirstChild == n.LastChild && n.FirstChild.Type == html.TextNode:
            // Only trim the text of non-inline elements.
            if !isInline(n) {
                c := n.FirstChild
                c.Data = strings.Trim(c.Data, " \t\n\r")
            }
            justRender = true
        case isInline(n) && contentIsInline(n):
            justRender = true
        }
        if justRender {
            indent(b, depth)
            html.Render(b, n)
            b.WriteByte('\n')
            return
        }
        indent(b, depth)
        fmt.Fprintln(b, html.Token{
            Type: html.StartTagToken,
            Data: n.Data,
            Attr: n.Attr,
        })
        // Computed once and grouped explicitly: without the grouping, && binds
        // tighter than ||, so the text-node check would apply to style only
        // and any child of a script element would be sent to prettyPrintScript.
        scriptOrStyle := n.Data == "script" || n.Data == "style"
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            if scriptOrStyle && c.Type == html.TextNode {
                prettyPrintScript(b, c.Data, depth+1)
            } else {
                prettyPrint(b, c, depth+1)
            }
        }
        indent(b, depth)
        fmt.Fprintln(b, html.Token{
            Type: html.EndTagToken,
            Data: n.Data,
        })

    case html.TextNode:
        // Whitespace-only text produces no output line.
        n.Data = strings.Trim(n.Data, " \t\n\r")
        if n.Data == "" {
            return
        }
        indent(b, depth)
        html.Render(b, n)
        b.WriteByte('\n')

    default:
        // Any other node type is rendered as-is on its own line.
        indent(b, depth)
        html.Render(b, n)
        b.WriteByte('\n')
    }
}

// isInline reports whether n counts as inline content. Text and comment
// nodes always do; an element does only when its tag name is in the fixed
// list below; every other node type does not.
func isInline(n *html.Node) bool {
    if n.Type == html.TextNode || n.Type == html.CommentNode {
        return true
    }
    if n.Type != html.ElementNode {
        return false
    }
    switch n.Data {
    case "b", "big", "i", "small", "tt",
        "abbr", "acronym", "cite", "dfn", "em", "kbd", "strong", "samp", "var",
        "a", "bdo", "img", "map", "object", "q", "span", "sub", "sup",
        "button", "input", "label", "select", "textarea":
        return true
    }
    return false
}

// contentIsInline reports whether every descendant of n is inline according
// to isInline. A node with no children yields true.
func contentIsInline(n *html.Node) bool {
    for child := n.FirstChild; child != nil; child = child.NextSibling {
        if isInline(child) && contentIsInline(child) {
            continue
        }
        // The first non-inline descendant settles the answer.
        return false
    }
    return true
}

// indent appends two spaces per level of depth to b. A zero or negative
// depth appends nothing.
func indent(b *bytes.Buffer, depth int) {
    for remaining := depth * 2; remaining > 0; remaining-- {
        b.WriteByte(' ')
    }
}

// prettyPrintScript writes the lines of s to b, re-indented by a simple
// bracket-counting heuristic starting at depth.
//
// Each line is trimmed and blank lines are skipped. A line beginning with
// '.' is indented one level deeper, a line beginning with a closing bracket
// one level shallower, and any other line at the current depth. After a line
// is placed, depth moves by that line's count of opening brackets minus its
// count of closing brackets.
func prettyPrintScript(b *bytes.Buffer, s string, depth int) {
    for _, raw := range strings.Split(s, "\n") {
        text := strings.TrimSpace(raw)
        if text == "" {
            continue
        }

        // Net bracket balance of this line; it takes effect on later lines.
        delta := 0
        for _, r := range text {
            if strings.ContainsRune("([{", r) {
                delta++
            } else if strings.ContainsRune(")]}", r) {
                delta--
            }
        }

        // Pick this line's own level from its first byte.
        level := depth
        switch first := text[0]; {
        case first == '.':
            level = depth + 1
        case first == ')' || first == ']' || first == '}':
            level = depth - 1
        }
        indent(b, level)

        depth += delta
        fmt.Fprintln(b, text)
    }
}