Question

I am new to coding and am trying to learn as I go.

I'm trying to create a python script that will grab and print all HEADERS from a list of urls in a txt file.

It seems to be getting there, but I'm stuck in an infinite loop with one of the URLs and I have no idea why, and for some reason "-h" or "--help" won't print the usage(). Any help would be appreciated.

Below is what I have so far:

 #!/usr/bin/python

 import pycurl
 import cStringIO
 import sys, getopt

 buf = cStringIO.StringIO()
 c = pycurl.Curl()

 def usage():
     """Print a one-line option summary, then terminate the script."""
     print "-h --help, -i --urlist, -o --proxy"
     sys.exit()

 def main(argv):
    # Parse command-line options for the URL list file (-i) and proxy (-o).
    # NOTE(review): the optstring "hi:o:t" also accepts a stray bare -t flag
    # that no branch below handles, and the long option is "iurlist=" while
    # usage() advertises "--urlist" -- confirm which spelling is intended.
    iurlist = None
    proxy = None
    try:
       opts, args = getopt.getopt(argv,"hi:o:t",["help", "iurlist=","proxy="])
       if not opts:
         print "No options supplied"
         print "Type -h for help"
         sys.exit()
    except getopt.GetoptError as err:
       print str(err)
       usage()
       sys.exit(2)

    for opt, arg in opts:
       # BUG (why -h/--help never shows usage): `opt == ("-h", "--help")`
       # compares the string opt against a tuple, which is always False,
       # so -h falls through to the else branch and trips the assert.
       # It should be `opt in ("-h", "--help")` like the branches below.
       if opt == ("-h", "--help"):
          usage()
          sys.exit()
       elif opt in ("-i", "--iurlist"):
           iurlist = arg
       elif opt in ("-o", "--proxy"):
           proxy = arg
       else:
          assert False, "Unhandeled option"
    # NOTE(review): iurlist and proxy are locals of main(); the with/try
    # blocks below are dedented to module level and therefore never see
    # these values -- they belong inside this function.

 # NOTE(review): everything below runs at import time, BEFORE main() is
 # ever called, and refers to main()'s local variables iurlist/proxy --
 # so this raises NameError/TypeError. These statements belong inside
 # main(), after option parsing.
 with open(iurlist) as f:
      iurlist = f.readlines()
      print iurlist

 try:
      for i in iurlist:
            # NOTE(review): readlines() keeps the trailing "\n" on every
            # line, so each URL is handed to pycurl with a newline attached.
            c.setopt(c.URL, i)
            c.setopt(c.PROXY, proxy)
            c.setopt(c.HEADER, 1)
            c.setopt(c.FOLLOWLOCATION, 1)
            c.setopt(c.MAXREDIRS, 30)
            c.setopt(c.USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0')
            c.setopt(c.TIMEOUT, 8)
            c.setopt(c.CONNECTTIMEOUT, 5)
            c.setopt(c.NOBODY, 1)
            c.setopt(c.PROXY, proxy)  # NOTE(review): duplicate of the PROXY setopt above
            c.setopt(c.WRITEFUNCTION, buf.write)
            c.setopt(c.SSL_VERIFYPEER, 0)
            c.perform()
            print buf.getvalue()
            buf.close  # NOTE(review): missing (), so the buffer is never actually closed

  # NOTE(review): this except is indented two spaces while the matching
  # try is indented one -- as pasted, this is an IndentationError.
  except pycurl.error, error:
       errno, errstr = error
       print 'An error has occurred: ', errstr

 if __name__ == "__main__":
    main(sys.argv[1:])

This is the latest code:

 #!/usr/bin/python

 import pycurl
 import cStringIO
 import sys, getopt

 c = pycurl.Curl()

 def usage():
     """Print the option summary plus an example invocation, then exit."""
     # NOTE(review): "-i" is advertised here as "--urlist", but getopt in
     # main() actually accepts "--iurlist" -- confirm the intended spelling.
     print "-h --help, -i --urlist, -o --proxy"
     print "Example Usage: cURLdect.py -i urlist.txt -o http://192.168.1.64:8080"
     sys.exit()

 def main(argv):
    iurlist = None
    proxy = None
    try:
       opts, args = getopt.getopt(argv,"hi:o:t",["help", "iurlist=","proxy="])
       if not opts:
         print "No options supplied"
         print "Type -h for help"
         sys.exit()
    except getopt.GetoptError as err:
       print str(err)
       usage()
       sys.exit(2)

    for opt, arg in opts:
       if opt in ("-h", "--help"):
          usage()
          sys.exit()
       elif opt in ("-i", "--iurlist"):
          iurlist = arg
       elif opt in ("-o", "--proxy"):
          proxy = arg
       else:
          assert False, "Unhandeled option"

    with open(iurlist) as f:
         iurlist = f.readlines()
         print iurlist

    try:
         for i in iurlist:
            buf = cStringIO.StringIO()
            c.setopt(c.WRITEFUNCTION, buf.write)
            c.setopt(c.PROXY, proxy)
            c.setopt(c.HEADER, 1)
            c.setopt(c.FOLLOWLOCATION, 1)
            c.setopt(c.MAXREDIRS, 300)
            c.setopt(c.USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0')
            c.setopt(c.TIMEOUT, 8)
            c.setopt(c.CONNECTTIMEOUT, 5)
            c.setopt(c.NOBODY, 1)
            c.setopt(c.SSL_VERIFYPEER, 0)
            c.setopt(c.URL, i)
            c.perform()
            print buf.getvalue()
            buf.close()
    except pycurl.error, error:
         errno, errstr = error
         print 'An error has occurred: ', errstr

 if __name__ == "__main__":
    # Script entry point: forward everything after the script name to main().
    main(sys.argv[1:])
Was it helpful?

Solution

If you are learning, pycurl may not be the best option. It assumes you're familiar with the libcurl library. From http://pycurl.sourceforge.net/:

PycURL is targeted at an advanced developer - if you need dozens of concurrent, fast and reliable connections or any of the sophisticated features listed above then PycURL is for you.

The main drawback of PycURL is that it is a relatively thin layer over libcurl without any of those nice Pythonic class hierarchies. This means it has a somewhat steep learning curve unless you are already familiar with libcurl's C API.

This is how they do a multi-fetch: https://github.com/pycurl/pycurl/blob/master/examples/retriever-multi.py


To fetch the headers a la python, install the requests library, and just do:

# r.headers exposes the response headers as a dict-like object.
for url in list_of_urls:
    r = requests.get(url)
    print r.headers

To deal with command-line arguments, use the argparse module included in Python's standard library ("batteries included").

OTHER TIPS

You're using

if opt == ("-h", "--help"):

for the help option, but

if opt in (....)

for all the other options. opt is either -h or --help, never both at once, so you need to use in to check whether opt is either of them, just as you do for the other options.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top