How to loop in Python over a set of HTML files and dump the data into JSON

Question 1

You can use glob.iglob to loop through all html files in a directory. For every filename, pass the file-like object to the BeautifulSoup constructor, get the elements you need and construct a dictionary:

import glob
from bs4 import BeautifulSoup

# Print one dictionary of book details for every HTML file in the
# current directory.
for filename in glob.iglob('*.html'):
    with open(filename) as f:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed, so results can differ between machines.
        soup = BeautifulSoup(f, "html.parser")

    title = soup.find("span", id="btAsinTitle")
    # The author is the first link (an <a> carrying an href) after the title.
    author = title.find_next("a", href=True)
    # Each value is the node that directly follows its bold label.
    isbn = soup.find('b', text='ISBN-10:').next_sibling
    weight = soup.find('b', text='Shipping Weight:').next_sibling

    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the bare print statement is a syntax error on Python 3.
    print({'title': title.get_text(),
           'author': author.get_text(),
           'isbn': isbn,
           'weight': weight})

Question 2

To process a set of files in some directory:

from glob import glob
# Hand every HTML file found in datadir/ to html2json, one path at a time.
for html_path in glob("datadir/*.html"):
    html2json(html_path)

Now we need the function html2json. It takes the name of an HTML file and writes a JSON string to a file with the same name as the HTML file, but with an added .json extension (for example, page.html becomes page.html.json).

import json
from bs4 import BeautifulSoup

def _node_text(node):
    """Return the text of a BeautifulSoup node (a tag or a bare text node)."""
    # Text nodes are string subclasses, and older bs4 releases give them no
    # get_text(); return them as they are instead of calling it.
    # type(u"") is the text type on both Python 2 and Python 3.
    if isinstance(node, type(u"")):
        return node
    return node.get_text()


def html2json(fname):
    """Extract book details from an HTML file and save them as JSON.

    The title, author, ISBN-10 and shipping weight are read from the page
    and written as a JSON object to a file named ``fname + ".json"``.

    Args:
        fname: Path of the HTML file to read.

    Raises:
        AttributeError: If one of the expected elements is not in the page.
    """
    with open(fname) as f:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed, so results can differ between machines.
        soup = BeautifulSoup(f, "html.parser")

    title = soup.find("span", id="btAsinTitle")
    isbn_label = soup.find('b', text='ISBN-10:')
    weight_label = soup.find('b', text='Shipping Weight:')

    resdct = {
        "title": title.get_text(),
        # The author is the first link (an <a> carrying an href) after the title.
        "author": title.find_next("a", href=True).get_text(),
        # Each value is the node that directly follows its bold label.
        "isbn": _node_text(isbn_label.next_sibling),
        "weight": _node_text(weight_label.next_sibling),
    }

    outfname = fname + ".json"
    with open(outfname, "w") as f:
        json.dump(resdct, f)