Question

Alright, so basically I have a Google script that searches for a keyword. The results look like:

    http://www.example.com/user/1234
    http://www.youtube.com/user/125
    http://www.forum.com/user/12

What could I do to organize these results like this?:

    Forums:
    http://www.forum.com/user/12

    YouTubes:
    http://www.youtube.com/user/125

    Unidentified:
    http://www.example.com/user/1234

By the way, I'm organizing them by keywords: if the URL has "forum" in it, it goes to the forum list; if it has "youtube", it goes to the YouTube list; and if no keyword matches, it goes to unidentified.


Solution 2

Something like this? I guess you will be able to adapt this example to your needs:

import pprint
import re

urls = ['http://www.example.com/user/1234',
        'http://www.youtube.com/user/126',
        'http://www.youtube.com/user/125',
        'http://www.forum.com/useryoutube/12']

# capture the domain name between "www." and the following dot
pattern = re.compile(r'//www\.(\w+)\.')

keys = ['forum', 'youtube']
results = dict()

for u in urls:
    m = pattern.search(u)
    key = m.group(1) if m else None
    if key not in keys:
        key = 'unidentified'  # no keyword matched
    results.setdefault(key, []).append(u)

pprint.pprint(results)

Other tips

1. Create a dict and assign an empty list to each keyword you have, e.g. my_dict = {'forums': [], 'youtube': [], 'unidentified': []}

2. Iterate over your URLs.

3. Generate a key for each URL (the domain name, in your case); you can extract it with the re regex module.

4. Check the dictionary from step 1 for this key: if it does not exist there, append the URL to the 'unidentified' list; if it does, append the URL to the list stored under that key.
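The four steps above can be sketched like this (a minimal version; it uses the singular key 'forum' rather than 'forums' so the key matches what the regex captures, and the sample URLs are the ones from the question):

```python
import re

# Step 1: one empty list per keyword, plus a catch-all bucket
my_dict = {'forum': [], 'youtube': [], 'unidentified': []}

urls = ['http://www.example.com/user/1234',
        'http://www.youtube.com/user/125',
        'http://www.forum.com/user/12']

# Step 2: iterate over the URLs
for url in urls:
    # Step 3: extract the domain name with a regex
    m = re.search(r'//www\.(\w+)\.', url)
    key = m.group(1) if m else None
    # Step 4: keys not present in the dict go to 'unidentified'
    if key not in my_dict:
        key = 'unidentified'
    my_dict[key].append(url)

print(my_dict)
```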

from urllib.parse import urlparse

urls = """
http://www.example.com/user/1234
http://www.youtube.com/user/125
http://www.forum.com/user/12
""".split()

categories = {
    "youtube.com": [],
    "forum.com": [],
    "unknown": [],
}

for url in urls:
    netloc = urlparse(url).netloc
    if netloc.count(".") == 2:
        # chop the sub-domain (e.g. "www.")
        netloc = netloc.split(".", 1)[1]
    if netloc in categories:
        categories[netloc].append(url)
    else:
        categories["unknown"].append(url)

print(categories)

Parse the URLs, find the category, append the full URL.

You should probably keep your categorized results in a dictionary and the uncategorized ones in a list. You could then sort them like so:

categorized_results = {"forum": [], "youtube": []}
uncategorized_results = []
for url in results:
    # split on dots so "forum" matches only a whole domain part
    parts = url.split(".")
    matched = False
    for k in categorized_results:
        if k in parts:
            categorized_results[k].append(url)  # keep the full URL
            matched = True
    if not matched:
        uncategorized_results.append(url)

If you'd like to output it neatly:

category_aliases = {"forum": "Forums:", "youtube": "YouTubes:"}
for category in categorized_results:
    print(category_aliases[category])
    for url in categorized_results[category]:
        print(url)
    print()
print("Unidentified:")
print("\n".join(uncategorized_results))  # let's not put in another for loop

How about this:

from functools import reduce
from urllib.parse import urlparse

class Organizing_Results(object):

    CATEGORY = {'example': [], 'youtube': [], 'forum': []}

    def __init__(self):
        self.url_list = []

    def add_single_url(self, url):
        self.url_list.append(urlparse(url))

    def _reduce_result_list(self, acc, element):
        # element.netloc is the domain part of the parsed URL
        for c in self.CATEGORY:
            if c in element.netloc:
                return self.CATEGORY[c].append(element)
        # unmatched URLs fall back to the 'example' bucket
        return self.CATEGORY['example'].append(element)

    def get_result(self):
        reduce(self._reduce_result_list, self.url_list, [])
        return self.CATEGORY

c = Organizing_Results()
c.add_single_url('http://www.example.com/user/1234')
c.add_single_url('http://www.youtube.com/user/1234')
c.add_single_url('http://www.unidentified.com/user/1234')
c.get_result()

You can easily broaden the class with more functions as you need them.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow