Question

I have a script, written using BeautifulSoup and urllib, that iterates through a list of URLs and downloads items of certain file types.

I iterate through a list of URLs, creating a soup object out of each and parsing for links.

The issue I'm experiencing is that I've found that sometimes links in the source are different, even though all the links I'm working through are within the same website. For example, sometimes it'll be '/dir/pdfs/file.pdf' or 'pdf/file.pdf' or '/pdfs/file.pdf'.

So, if there's a full URL, urlretrieve() knows how to handle it, but if it's just a subdirectory like those listed above, it returns an error. I can of course follow the link from the source manually, but urlretrieve() doesn't know what to do with it, so I have to add a base URL (like www.example.com/ or www.example.com/dir/) to the urlretrieve() call.
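To illustrate, this is roughly the failure I see with one of the relative links (the path here is made up):

>>> from urllib import urlretrieve
>>> urlretrieve('/dir/pdfs/file.pdf')
Traceback (most recent call last):
  ...
IOError: [Errno 2] No such file or directory: '/dir/pdfs/file.pdf'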

What I'm having trouble writing is logic so that, if a download fails, the script tries different base URLs until one works and prints the resulting URL, and if none of them work, prints an error message with the file in question so I can grab it manually.

Could someone point me in the right direction?

from re import compile
from urllib import urlopen, urlretrieve
from bs4 import BeautifulSoup   # BeautifulSoup 3: from BeautifulSoup import BeautifulSoup

URLs = []
BASEURL = []
FILETYPE = [r'\.pdf$', r'\.ppt$', r'\.pptx$', r'\.doc$',
            r'\.docx$', r'\.xls$', r'\.xlsx$', r'\.wmv$']

def main():
    for link in soup.findAll(href=compile(types)):
        file = link.get('href')
        filename = file.split('/')[-1]

        urlretrieve(file, filename)
        print file

if __name__ == "__main__":
    for url in URLs:
        html_data = urlopen(url)
        soup = BeautifulSoup(html_data)

        for types in FILETYPE:
            main()

Solution

A better option would be to build the correct absolute URLs to start with:

import urlparse  # compile, urlopen, urlretrieve and BeautifulSoup are imported as in the question

def main(soup, domain, path, types):
    for link in soup.findAll(href=compile(types)):
        file = link.get('href')

        # Make the file URL absolute here
        if '://' not in file and not file.startswith('//'):
            if not file.startswith('/'):
                file = urlparse.urljoin(path, file)
            file = urlparse.urljoin(domain, file)

        try:
            urlretrieve(file)  # pass a second argument to choose the local filename
        except IOError:
            print 'Error retrieving %s using URL %s' % (
                link.get('href'), file)

for url in URLs:
    html_data = urlopen(url)
    soup = BeautifulSoup(html_data)

    urlinfo = urlparse.urlparse(url)
    domain = urlparse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
    path = urlinfo.path.rsplit('/', 1)[0] + '/'  # keep the trailing slash so urljoin treats it as a directory

    for types in FILETYPE:
        main(soup, domain, path, types)

The urlparse function splits the source URL into two pieces: domain holds the URI scheme and the domain name, and path holds the "directory" of the source page on the server (kept with a trailing slash so urljoin treats it as a directory). For example:

>>> url = "http://www.example.com/some/web/page.html"
>>> urlinfo = urlparse.urlparse(url)
>>> urlinfo
ParseResult(scheme='http', netloc='www.example.com',
            path='/some/web/page.html', params='', query='', fragment='')
>>> domain = urlparse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
>>> domain
'http://www.example.com'
>>> path = urlinfo.path.rsplit('/', 1)[0] + '/'
>>> path
'/some/web/'

Then domain and path are used as the base for each href encountered (a short demonstration follows the list):

  • if the href contains "://" or starts with "//", assume it is absolute: no modification needed,
  • else if the href starts with "/", it is relative to the domain: prepend the domain,
  • otherwise the href is relative to the path: prepend the domain and the base path.
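For instance, continuing the interactive session above with a couple of illustrative hrefs (these particular paths are not from the question):

>>> urlparse.urljoin(domain, '/dir/pdfs/file.pdf')
'http://www.example.com/dir/pdfs/file.pdf'
>>> urlparse.urljoin(domain, urlparse.urljoin(path, 'pdf/file.pdf'))
'http://www.example.com/some/web/pdf/file.pdf'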

OTHER TIPS

Assuming the download method downloads the file and returns True if it was downloaded successfully, or False if it failed, this goes through all of the possible file paths given by urls and files.

def download(url, file):
    print url + file
    # Pretend the download failed and return False, so that for this demo
    # the loop below walks through every base URL for every file.
    return False

def main():
    urls = ["example.com/", "example.com/docs/", "example.com/dir/docs/", "example.com/dir/docs/files/"]

    files = ["file1.pdf", "file2.pdf", "file3.pdf"]

    for file in files:
        for url in urls:
            success = download(url, file)
            if success:
                break


main()
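For reference, a real download() could look something like the sketch below; the use of urllib2 and the local filename handling are assumptions rather than part of the answer above, and the base URLs would need a scheme ("http://...") for it to work.

import urllib2

def download(url, file):
    # Sketch: fetch url + file and save it under the file's own name.
    # urllib2 raises HTTPError/URLError on failure (unlike urllib.urlretrieve,
    # which may quietly save an HTTP error page instead).
    try:
        data = urllib2.urlopen(url + file).read()
    except (urllib2.HTTPError, urllib2.URLError, ValueError):
        print 'Failed to fetch %s%s' % (url, file)
        return False
    with open(file.split('/')[-1], 'wb') as f:
        f.write(data)
    return True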

You need to catch the exception and try the next base url. That said, you can also attempt to make the links absolute before issuing requests. I believe that is the best approach since it avoids making lots of unnecessary requests. lxml has a handy make_links_absolute() function for this purpose.

Also check out urlparse.urljoin for this. Continuing with the approach you are already using...

html_data = urlopen(url)
soup = BeautifulSoup(html_data)
for link in soup.findAll(href=compile(types)):
    file = link.get('href')
    for domain in (url, 'http://www.one.com', 'http://www.two.com'):
        path = urlparse.urljoin(domain, file)
        try:
            urlretrieve(path)
            break  # stop trying new domains
        except IOError:
            print 'Error downloading {0}'.format(path)
            # will go to the next domain

If I were doing this with lxml it would be something like:

import lxml.html

req = urlopen(url)
html = req.read()
root = lxml.html.fromstring(html)
root.make_links_absolute(url)  # resolve every link against the page URL
for a in root.iterlinks():     # yields (element, attribute, link, pos) tuples
    if a[2].endswith('.pdf'):
        # download link ending with .pdf
        req = urlretrieve(a[2])
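If you need the other extensions from the question's FILETYPE list as well, a possible variant of that loop (only a sketch; it reuses root from above and assumes urlretrieve is imported as in the question) is:

import re

# Single pattern covering the extensions from the question's FILETYPE list
wanted = re.compile(r'\.(pdf|pptx?|docx?|xlsx?|wmv)$')

for element, attribute, link, pos in root.iterlinks():
    if attribute == 'href' and wanted.search(link):
        urlretrieve(link, link.split('/')[-1])  # save under the file's own name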
Licensed under: CC-BY-SA with attribution