التخزين المؤقت في urllib2؟

https://stackoverflow.com/questions/148853

02-07-2019
|

سؤال

هل هناك طريقة سهلة لتخزين الأشياء مؤقتًا عند استخدام urllib2 ربما غفلتُ عنها، أم يجب عليّ أن أكتبها بنفسي؟

الحل

يمكنك استخدام دالة مُزخرِفة (decorator) مثل:

class cache(object):
    """Memoising decorator that keeps results for the life of the process.

    Results are stored in ``self.cache``, keyed on the textual form of the
    call's arguments, so unhashable arguments (lists, dicts) work too.
    Entries are never evicted.
    """

    def __init__(self, fun):
        """Wrap *fun*; each distinct set of arguments is computed once."""
        self.fun = fun
        self.cache = {}
        # Keep the wrapped callable's name and docstring visible on the wrapper
        self.__name__ = getattr(fun, "__name__", type(fun).__name__)
        self.__doc__ = getattr(fun, "__doc__", None)

    def __call__(self, *args, **kwargs):
        """Return the stored result for these arguments, computing it once.

        Exceptions raised by the wrapped callable propagate to the caller
        and nothing is stored for that call.
        """
        # Sort keyword arguments so f(a=1, b=2) and f(b=2, a=1) share an entry.
        # The key is always a string, so it is always a valid dict key.
        key = repr(args) + repr(sorted(kwargs.items()))
        try:
            return self.cache[key]
        except KeyError:
            pass
        rval = self.cache[key] = self.fun(*args, **kwargs)
        return rval

ثم تعريف دالة على غرار:

@cache
def get_url_src(url):
    """Return the body fetched from *url*.

    The ``cache`` decorator stores the result, so repeated calls with the
    same URL reuse the first download instead of fetching again.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails
        response.close()

هذا على افتراض أنك لا تهتم بعناصر التحكم في ذاكرة التخزين المؤقت لـ HTTP، ولكنك تريد فقط تخزين الصفحة مؤقتًا طوال مدة التطبيق

نصائح أخرى

إذا كنت لا تمانع في العمل بمستوى أقل قليلاً، httplib2 (https://github.com/httplib2/httplib2) هي مكتبة HTTP ممتازة تتضمن وظيفة التخزين المؤقت.

قد تكون وصفة ActiveState Python هذه مفيدة: http://code.activestate.com/recipes/491261/

لقد كنت دائمًا في حيرة بين استخدام httplib2، الذي يقوم بعمل قوي في التعامل مع التخزين المؤقت والمصادقة لـ HTTP، وurllib2، الموجود في stdlib، وله واجهة قابلة للتوسيع، ويدعم خوادم HTTP Proxy.

تبدأ وصفة ActiveState بإضافة دعم التخزين المؤقت إلى urllib2، ولكن بطريقة بدائية للغاية. فهي لا تسمح بتوسيع آليات التخزين، إذ إن التخزين المعتمد على نظام الملفات مكتوب فيها بشكل ثابت (hard-coded). كما أنها لا تحترم رؤوس التخزين المؤقت في HTTP.

في محاولة للجمع بين أفضل ميزات التخزين المؤقت httplib2 وقابلية التوسعة urllib2، قمت بتكييف وصفة ActiveState لتنفيذ معظم وظائف التخزين المؤقت نفسها الموجودة في httplib2.الوحدة موجودة في jaraco.net باسم jaraco.net.http.caching.يشير الرابط إلى الوحدة كما كانت موجودة في وقت كتابة هذه السطور.على الرغم من أن هذه الوحدة هي حاليًا جزء من حزمة jaraco.net الأكبر حجمًا، إلا أنها لا تحتوي على تبعيات داخل الحزمة، لذا لا تتردد في سحب الوحدة واستخدامها في مشاريعك الخاصة.

وبدلاً من ذلك، إذا كان لديك Python 2.6 أو إصدار أحدث، فيمكنك تنفيذ easy_install "jaraco.net>=1.3" ثم استخدام CacheHandler بشيء مثل الكود الموجود في caching.quick_test().

"""Quick test/example of CacheHandler"""
import logging
import urllib2
from httplib2 import FileCache
from jaraco.net.http.caching import CacheHandler

# Debug-level logging so library log output is shown while the example runs
logging.basicConfig(level=logging.DEBUG)
# httplib2's FileCache rooted at ./.cache is handed to CacheHandler as its store
store = FileCache(".cache")
opener = urllib2.build_opener(CacheHandler(store))
# Make this opener the default for later urllib2.urlopen() calls as well
urllib2.install_opener(opener)
response = opener.open("http://www.google.com/")
print response.headers
# Only the first 100 characters of the body are printed
print "Response:", response.read()[:100], '...\n'

# NOTE(review): reload() presumably re-fetches and refreshes the stored entry -- confirm against jaraco.net
response.reload(store)
print response.headers
print "After reload:", response.read()[:100], '...\n'

لاحظ أن jaraco.net.http.caching لا يحدد مواصفات للمخزن الخلفي (backing store) لذاكرة التخزين المؤقت، بل يتبع الواجهة التي يستخدمها httplib2. لهذا السبب، يمكن استخدام httplib2.FileCache مباشرة مع urllib2 وCacheHandler. كذلك، يُفترض أن تكون مخازن التخزين المؤقت الأخرى المصممة لـ httplib2 قابلة للاستخدام مع CacheHandler.

كنت أبحث عن شيء مماثل، وصادفت "الوصفة 491261: التخزين المؤقت والتحكم في معدل الطلبات (throttling) لـ urllib2" التي نشرها دانيفو. المشكلة هي أنني حقًا لا يعجبني كود التخزين المؤقت (الكثير من التكرار، والكثير من ضم مسارات الملفات يدويًا بدلاً من استخدام os.path.join، واستخدام الدوال الساكنة (staticmethods)، وعدم الالتزام بـ PEP8، وأشياء أخرى أحاول تجنبها)

الكود أجمل قليلاً (في رأيي على أي حال) وهو مطابق من الناحية الوظيفية إلى حد كبير، مع بعض الإضافات - بشكل أساسي الدالة recache (يمكن الاطلاع على مثال للاستخدام هنا، أو في قسم if __name__ == "__main__": في نهاية الكود).

يمكن العثور على أحدث إصدار في http://github.com/dbr/tvdb_api/blob/master/cache.py وسألصقه هنا للأجيال القادمة (مع إزالة الرؤوس الخاصة بتطبيقي):

#!/usr/bin/env python
"""
urllib2 caching handler
Modified from http://code.activestate.com/recipes/491261/ by dbr
"""

import os
import time
import httplib
import urllib2
import StringIO
from hashlib import md5

def calculate_cache_path(cache_location, url):
    """Return the ``(header_path, body_path)`` pair used to cache *url*.

    Both paths sit directly inside *cache_location* and are named after the
    MD5 hex digest of the URL, with ``.headers`` and ``.body`` suffixes.
    Nothing is created or checked on disk; this only computes the paths.
    """
    # md5() needs bytes: text URLs are encoded as UTF-8 so that non-ASCII
    # URLs are hashed instead of raising
    if not isinstance(url, bytes):
        url = url.encode("utf-8")
    thumb = md5(url).hexdigest()
    header = os.path.join(cache_location, thumb + ".headers")
    body = os.path.join(cache_location, thumb + ".body")
    return header, body

def check_cache_time(path, max_age):
    """Tell whether *path* is a file modified within the last *max_age* seconds.

    Returns True for a sufficiently recent file. Returns False when *path*
    is not an existing regular file, or when its modification time is more
    than *max_age* seconds in the past.
    """
    if os.path.isfile(path):
        modified_at = os.stat(path).st_mtime
        cutoff = time.time() - max_age
        # Anything modified before the cutoff counts as stale
        return not (modified_at < cutoff)
    return False

def exists_in_cache(cache_location, url, max_age):
    """Report whether an up-to-date cached copy of *url* is available.

    True only when both the header file and the body file exist and each
    one passes check_cache_time for *max_age*.
    """
    header_file, body_file = calculate_cache_path(cache_location, url)
    # Either file missing means there is nothing usable in the cache
    if not os.path.exists(header_file):
        return False
    if not os.path.exists(body_file):
        return False
    if not check_cache_time(header_file, max_age):
        return False
    return check_cache_time(body_file, max_age)

def store_in_cache(cache_location, url, response):
    """Try to write *response*'s headers and body into the cache.

    The body is obtained with ``response.read()``, so the response is
    consumed by this call.

    Returns False when both files were written and True when an IOError
    interrupted the writing. CacheHandler.http_response forwards this value
    as the ``set_cache_header`` argument of CachedResponse.
    """
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        # ``with`` closes each file even when a write fails part-way
        with open(hpath, "w") as outf:
            outf.write(str(response.info()))

        with open(bpath, "w") as outf:
            outf.write(response.read())
    except IOError:
        return True
    else:
        return False

class CacheHandler(urllib2.BaseHandler):
    """Stores responses in a persistent on-disk cache.

    If a subsequent GET request is made for the same URL, the stored
    response is returned, saving time, resources and bandwidth
    """
    def __init__(self, cache_location, max_age = 21600):
        """Remember the cache settings and make sure the directory exists.

        cache_location -- directory holding the cache files; created with
            os.mkdir when it does not exist yet
        max_age -- number of seconds a cached file stays valid
            (default 21600, i.e. six hours)
        """
        self.max_age = max_age
        self.cache_location = cache_location
        if not os.path.exists(self.cache_location):
            os.mkdir(self.cache_location)

    def default_open(self, request):
        """Serve GET requests from the cache when an up-to-date copy exists.

        Returns a CachedResponse on a cache hit, otherwise None so that the
        next handler gets to handle the request.
        """
        # Compare by value: "is not" tests object identity, which is not
        # guaranteed to hold for two equal strings
        if request.get_method() != "GET":
            return None # let the next handler try to handle the request

        if exists_in_cache(
            self.cache_location, request.get_full_url(), self.max_age
        ):
            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header = True
            )
        else:
            return None

    def http_response(self, request, response):
        """Cache successful GET responses and hand back a CachedResponse.

        For a GET request whose status code starts with 2 (200 OK etc), the
        response is written to the cache unless it already carries the
        x-local-cache header (meaning it was served from the cache), and a
        CachedResponse read back from the cache files is returned.
        Any other response is returned unchanged.
        """
        if (request.get_method() == "GET"
            and str(response.code).startswith("2")
        ):
            if 'x-local-cache' not in response.info():
                # Response is not cached
                set_cache_header = store_in_cache(
                    self.cache_location,
                    request.get_full_url(),
                    response
                )
            else:
                set_cache_header = True
            #end if x-cache in response

            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header = set_cache_header
            )
        else:
            return response

class CachedResponse(StringIO.StringIO):
    """An urllib2.response-like object for cached responses.

    To determine if a response is cached or coming directly from
    the network, check the x-local-cache header rather than the object type.
    """
    def __init__(self, cache_location, url, set_cache_header=True):
        """Load the cached body and headers of *url* from *cache_location*.

        cache_location -- directory containing the cache files
        url -- URL whose cache files are read
        set_cache_header -- when true, an ``x-local-cache`` header holding
            the body file's path is appended to the stored headers

        The status is always reported as 200 / "OK". Opening a missing
        cache file raises the error from ``open``.
        """
        self.cache_location = cache_location
        hpath, bpath = calculate_cache_path(cache_location, url)

        # Context managers close the cache files right away instead of
        # leaving the open handles to the garbage collector
        with open(bpath) as body_file:
            StringIO.StringIO.__init__(self, body_file.read())

        self.url     = url
        self.code    = 200
        self.msg     = "OK"
        with open(hpath) as header_file:
            headerbuf = header_file.read()
        if set_cache_header:
            headerbuf += "x-local-cache: %s\r\n" % (bpath)
        self.headers = httplib.HTTPMessage(StringIO.StringIO(headerbuf))

    def info(self):
        """Returns headers
        """
        return self.headers

    def geturl(self):
        """Returns original URL
        """
        return self.url

    def recache(self):
        """Fetch the URL again, overwrite its cache files and reload self.

        The outcome of the store attempt is not inspected; the object is
        simply re-initialised from whatever is in the cache afterwards.
        """
        new_request = urllib2.urlopen(self.url)
        # Store under self.url, the key __init__ reads back from. Using the
        # fetched response's own URL would write to a different cache entry
        # after a redirect and leave this one stale.
        store_in_cache(
            self.cache_location,
            self.url,
            new_request
        )
        CachedResponse.__init__(self, self.cache_location, self.url, True)


if __name__ == "__main__":
    def main():
        """Quick test/example of CacheHandler"""
        # /tmp/ is used as the cache directory for this demo
        opener = urllib2.build_opener(CacheHandler("/tmp/"))
        response = opener.open("http://google.com")
        print response.headers
        print "Response:", response.read()

        # recache() fetches the URL again and reloads this response object
        response.recache()
        print response.headers
        print "After recache:", response.read()
    main()

هذه المقالة على شبكة مطوري Yahoo - http://developer.yahoo.com/python/python-caching.html - تصف كيفية تخزين طلبات HTTP التي تُجرى من خلال urllib مؤقتًا، إما في الذاكرة أو على القرص.

@dbr: قد ترغب أيضًا في إضافة التخزين المؤقت لاستجابات https باستخدام:

def https_response(self, request, response):
    """Process an HTTPS response by delegating to http_response."""
    handled = self.http_response(request, response)
    return handled

مرخصة بموجب: CC-BY-SA مع الإسناد

لا تنتمي إلى StackOverflow