Question

I am trying to scrape multiple JSON pages, but nothing is being stored in the file. Also, I currently have to hard-code the number of pages. How can I make the loop continue automatically until the last page?

import urllib

for x in xrange(1, 5):
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code=us_ca&current_status=Active&page={0}'.format(x)
    file_name = url.split('/')[-1]
    u = urllib.urlopen(url)
    f = open(file_name, 'wb')
    meta = u.info()
    file_size = int(meta.getheaders("Content-Length")[0])
    print "Downloading: %s Bytes: %s" % (file_name, file_size)
    file_size_dl = 0
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
    file_size_dl += len(buffer)
    f.write(buffer)
    status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
    status = status + chr(8)*(len(status)+1)
    print status,
    f.close()

Solution

Your loop never stores anything because f.write(buffer) sits outside the while loop: by the time it executes, the loop has already drained the response and buffer is the final, empty read. For a JSON API there is no need for chunked reads at all; fetch each page in full and keep incrementing the page number:

import urllib2
import json

url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code=us_ca&current_status=Active&page='
i = 0
while True:
    i += 1
    print i
    # Read each page in one go and save it as 1.json, 2.json, ...
    response = urllib2.urlopen('%s%d' % (url, i))
    content = response.read()
    with open(str(i) + '.json', 'w') as f:
        f.write(content)

This ran until page 22, where it raised HTTP Error 401: Unauthorized, presumably the point at which the open API stops serving further unauthenticated requests.
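
Rather than crashing on that 401, you can stop cleanly. Below is a minimal sketch; it assumes the v0.2 response carries a results.total_pages field (verify the field name against the OpenCorporates API docs), with the HTTPError handler as a fallback for when the field is absent or the request limit is hit first:

import urllib2
import json

url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code=us_ca&current_status=Active&page='
page = 1
while True:
    try:
        response = urllib2.urlopen('%s%d' % (url, page))
    except urllib2.HTTPError as e:
        # Stop on any HTTP error, e.g. the 401 returned once the
        # unauthenticated request limit is reached.
        print 'Stopped at page %d: %s' % (page, e)
        break
    content = response.read()
    with open('%d.json' % page, 'w') as f:
        f.write(content)
    data = json.loads(content)
    # total_pages is an assumption about the v0.2 response format;
    # if the field is missing, we rely on the HTTPError above to stop.
    total_pages = data.get('results', {}).get('total_pages')
    if total_pages is not None and page >= total_pages:
        break
    page += 1

Writing the raw bytes to disk before parsing means every page that downloaded successfully is kept, even if a later page fails to parse.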

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow