urllib2의 캐싱?

https://stackoverflow.com/questions/148853

02-07-2019
|

문제

urllib2를 사용할 때 제가 놓치고 있는 간단한 캐싱 방법이 있습니까, 아니면 직접 구현해야 합니까?

해결책

다음과 같은 데코레이터를 사용할 수 있습니다.

class cache(object):
    """Memoizing decorator: remember a function's results per argument set.

    Results are kept in ``self.cache`` for as long as the decorated object
    lives; nothing is ever evicted.  Keys are built from the textual form of
    the arguments, so calls with unhashable arguments (lists, dicts) are
    cached as well.
    """

    def __init__(self, fun):
        """Wrap *fun* and start with an empty result cache."""
        # Carry over the wrapped function's identity so the decorated object
        # still reports the original name and docstring.
        for attr in ("__module__", "__name__", "__doc__"):
            if hasattr(fun, attr):
                setattr(self, attr, getattr(fun, attr))
        self.fun = fun
        self.cache = {}

    def __call__(self, *args, **kwargs):
        """Return the result for these arguments, calling ``fun`` only once."""
        # Sort the keyword arguments so f(a=1, b=2) and f(b=2, a=1) share one
        # entry; str(kwargs) depended on the order they were passed in.
        key = str(args) + str(sorted(kwargs.items()))
        try:
            return self.cache[key]
        except KeyError:
            # A string key is always hashable, so a miss is the only way the
            # lookup can fail: compute, remember and return the value.
            self.cache[key] = rval = self.fun(*args, **kwargs)
            return rval

그런 다음 아래와 같은 식으로 함수를 정의합니다.

@cache
def get_url_src(url):
    """Fetch *url* and return the response body.

    Thanks to ``@cache`` each distinct URL is downloaded at most once per
    process; later calls return the remembered body.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()

이는 HTTP 캐시 제어(Cache-Control) 헤더는 고려하지 않고, 애플리케이션이 실행되는 동안에만 페이지를 캐시하려는 경우를 가정한 것입니다.

다른 팁

조금 더 낮은 수준에서 작업해도 괜찮다면, httplib2 (https://github.com/httplib2/httplib2)는 캐싱 기능을 포함한 훌륭한 HTTP 라이브러리입니다.

이 ActiveState 파이썬 레시피가 도움이 될 수 있습니다: http://code.activestate.com/recipes/491261/

저는 항상 httplib2와 urllib2 사이에서 고민해 왔습니다. httplib2는 HTTP 캐싱과 인증을 견고하게 처리하고, urllib2는 표준 라이브러리(stdlib)에 포함되어 있으며 확장 가능한 인터페이스를 갖추고 HTTP 프록시 서버를 지원합니다.

ActiveState 레시피는 urllib2에 캐싱 지원을 추가하기는 하지만 매우 원시적인 방식에 그칩니다. 파일 시스템 기반 저장소가 하드코딩되어 있어 저장 메커니즘을 확장할 수 없고, HTTP 캐시 헤더도 따르지 않습니다.

httplib2의 캐싱과 urllib2의 확장성이라는 두 가지 장점을 합치기 위해, ActiveState 레시피를 수정하여 httplib2에 있는 캐싱 기능의 대부분을 구현했습니다. 이 모듈은 jaraco.net 패키지의 jaraco.net.http.caching에 있습니다. 링크는 이 글을 쓰는 시점의 모듈을 가리킵니다. 해당 모듈은 현재 더 큰 jaraco.net 패키지의 일부이지만 패키지 내부 의존성이 없으므로, 모듈만 꺼내어 자신의 프로젝트에서 자유롭게 사용해도 됩니다.

또는 파이썬 2.6 이상을 사용한다면 easy_install jaraco.net>=1.3 으로 설치한 다음, caching.quick_test()의 코드와 같은 방식으로 CacheHandler를 사용할 수 있습니다.

"""Quick test/example of CacheHandler"""
import logging
import urllib2
from httplib2 import FileCache
from jaraco.net.http.caching import CacheHandler

# Log at DEBUG level for the duration of this demo.
logging.basicConfig(level=logging.DEBUG)
# httplib2's FileCache, given ".cache" as its location, is the backing store.
store = FileCache(".cache")
# Build an opener that includes the caching handler and make it the default
# opener for urllib2.urlopen().
opener = urllib2.build_opener(CacheHandler(store))
urllib2.install_opener(opener)
# First request: show the headers and the first 100 characters of the body.
response = opener.open("http://www.google.com/")
print response.headers
print "Response:", response.read()[:100], '...\n'

# NOTE(review): reload() comes from jaraco.net; presumably it re-fetches the
# page and refreshes the stored copy -- confirm against that package.
response.reload(store)
print response.headers
print "After reload:", response.read()[:100], '...\n'

jaraco.net.http.caching은 캐시의 백엔드 저장소에 대한 자체 규격을 정의하지 않고, 대신 httplib2가 사용하는 인터페이스를 따릅니다. 따라서 httplib2.FileCache를 urllib2 및 CacheHandler와 함께 그대로 사용할 수 있습니다. 또한 httplib2용으로 만들어진 다른 캐시 저장소도 CacheHandler에서 사용할 수 있을 것입니다.

저도 비슷한 것을 찾다가 danivo가 게시한 "레시피 491261: urllib2를 위한 캐싱 및 스로틀링"을 발견했습니다. 문제는 그 캐싱 코드가 정말 마음에 들지 않는다는 것입니다 (중복이 많고, os.path.join을 쓰지 않고 파일 경로를 직접 이어 붙이는 곳이 많으며, staticmethod를 사용하고, PEP 8 스타일과도 거리가 멀며, 그 밖에도 제가 피하려고 하는 것들이 있습니다).

아래 코드는 (적어도 제 생각에는) 조금 더 깔끔하며, 기능은 거의 같고 몇 가지가 추가되었습니다. 주로 "recache" 메서드입니다 (사용 예는 여기서 볼 수 있으며, 코드 끝의 if __name__ == "__main__": 섹션에서도 볼 수 있습니다).

최신 버전은 http://github.com/dbr/tvdb_api/blob/master/cache.py 에서 찾을 수 있으며, 기록을 위해 여기에도 붙여 넣습니다 (애플리케이션 전용 헤더는 제거한 상태입니다):

#!/usr/bin/env python
"""
urllib2 caching handler
Modified from http://code.activestate.com/recipes/491261/ by dbr
"""

import os
import time
import httplib
import urllib2
import StringIO
from hashlib import md5

def calculate_cache_path(cache_location, url):
    """Return the ``(header, body)`` cache file paths for *url*.

    Both files live directly in *cache_location* and are named after the
    MD5 hex digest of the URL, with ``.headers`` and ``.body`` suffixes.
    Nothing is read from or written to disk here; the paths are only
    computed.
    """
    # md5() needs bytes: encode text URLs (py2 ``unicode`` / py3 ``str``) as
    # UTF-8.  ASCII URLs hash to the same digest as before.
    if not isinstance(url, bytes):
        url = url.encode("utf-8")
    thumb = md5(url).hexdigest()
    header = os.path.join(cache_location, thumb + ".headers")
    body = os.path.join(cache_location, thumb + ".body")
    return header, body

def check_cache_time(path, max_age):
    """Tell whether *path* is a regular file modified within *max_age* seconds.

    Returns True when the file exists and its modification time is no older
    than ``max_age`` seconds; returns False when the file is missing (or is
    not a regular file) or is too old.
    """
    if not os.path.isfile(path):
        return False
    modified_at = os.path.getmtime(path)
    oldest_allowed = time.time() - max_age
    # Stale entries are those modified before the cut-off.
    return not (modified_at < oldest_allowed)

def exists_in_cache(cache_location, url, max_age):
    """Report whether *url* has a usable cache entry.

    An entry is usable only when both its header file and its body file
    exist and each of them passes ``check_cache_time`` for *max_age*.
    """
    header_file, body_file = calculate_cache_path(cache_location, url)
    if not (os.path.exists(header_file) and os.path.exists(body_file)):
        # At least one half of the entry is missing.
        return False
    return (
        check_cache_time(header_file, max_age)
        and check_cache_time(body_file, max_age)
    )

def store_in_cache(cache_location, url, response):
    """Try to write *response* to the on-disk cache entry for *url*.

    The headers (``str(response.info())``) go to the ``.headers`` file and
    the body (``response.read()``) to the ``.body`` file whose paths come
    from ``calculate_cache_path``.  Reading the body consumes *response*.

    Returns:
        ``False`` when both files were written, ``True`` when an
        ``IOError`` interrupted the write.  Callers in this module pass the
        result on as ``set_cache_header``, so this polarity is kept as-is.
    """
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        # ``with`` closes each file even when a write or response.read()
        # raises, so no handle is leaked on a failed store.
        with open(hpath, "w") as header_file:
            header_file.write(str(response.info()))

        # NOTE(review): the body is written in text mode, matching the
        # text-mode read in CachedResponse; the two must change together.
        with open(bpath, "w") as body_file:
            body_file.write(response.read())
    except IOError:
        return True
    else:
        return False

class CacheHandler(urllib2.BaseHandler):
    """Stores responses in a persistent on-disk cache.

    If a subsequent GET request is made for the same URL, the stored
    response is returned, saving time, resources and bandwidth.
    """
    def __init__(self, cache_location, max_age = 21600):
        """Remember the cache settings and create the cache directory.

        cache_location -- directory that holds the cached files; it is
                          created with os.mkdir when it does not exist
        max_age        -- how many seconds a cached entry stays valid
                          (default 21600, i.e. six hours)
        """
        self.max_age = max_age
        self.cache_location = cache_location
        if not os.path.exists(self.cache_location):
            os.mkdir(self.cache_location)

    def default_open(self, request):
        """Serve a GET request from the cache when a fresh entry exists.

        Returns a CachedResponse on a cache hit, otherwise None so that the
        next handler tries to handle the request.
        """
        # Compare by value: ``is not "GET"`` only worked when the two
        # strings happened to be the same object.
        if request.get_method() != "GET":
            return None # let the next handler try to handle the request

        url = request.get_full_url()
        if exists_in_cache(self.cache_location, url, self.max_age):
            return CachedResponse(
                self.cache_location,
                url,
                set_cache_header = True
            )
        return None

    def http_response(self, request, response):
        """Cache successful GET responses and hand back a CachedResponse.

        A response is cached when the request method is GET and the status
        code starts with 2 (200 OK etc); any other response is returned
        untouched.
        """
        if (request.get_method() == "GET"
            and str(response.code).startswith("2")
        ):
            if 'x-local-cache' not in response.info():
                # Response is not cached yet: write it to disk.  The result
                # of store_in_cache is used directly as set_cache_header.
                set_cache_header = store_in_cache(
                    self.cache_location,
                    request.get_full_url(),
                    response
                )
            else:
                # Response already came out of the cache
                set_cache_header = True

            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header = set_cache_header
            )
        else:
            return response

class CachedResponse(StringIO.StringIO):
    """An urllib2.response-like object for cached responses.

    To determine if a response is cached or coming directly from
    the network, check the x-local-cache header rather than the object type.
    """
    def __init__(self, cache_location, url, set_cache_header=True):
        """Load the cached body and headers for *url* from *cache_location*.

        cache_location   -- directory that holds the cached files
        url              -- URL whose cache entry is loaded
        set_cache_header -- when true, an ``x-local-cache`` header holding
                            the body file path is appended to the headers

        Raises IOError when either cache file cannot be read.
        """
        self.cache_location = cache_location
        hpath, bpath = calculate_cache_path(cache_location, url)

        # Read through ``with`` so the file handles are closed right away
        # instead of lingering until garbage collection.
        with open(bpath) as body_file:
            body = body_file.read()
        StringIO.StringIO.__init__(self, body)

        self.url     = url
        self.code    = 200
        self.msg     = "OK"
        with open(hpath) as header_file:
            headerbuf = header_file.read()
        if set_cache_header:
            headerbuf += "x-local-cache: %s\r\n" % (bpath)
        self.headers = httplib.HTTPMessage(StringIO.StringIO(headerbuf))

    def info(self):
        """Returns headers
        """
        return self.headers

    def geturl(self):
        """Returns original URL
        """
        return self.url

    def recache(self):
        """Fetch the URL again, overwrite its cache entry and reload it."""
        new_request = urllib2.urlopen(self.url)
        try:
            # Store under self.url, the entry this object reloads from
            # below.  Keying on new_request.url left this entry stale
            # whenever the two URLs differed.
            store_in_cache(self.cache_location, self.url, new_request)
        finally:
            new_request.close()
        CachedResponse.__init__(self, self.cache_location, self.url, True)


if __name__ == "__main__":
    def main():
        """Quick test/example of CacheHandler"""
        opener = urllib2.build_opener(CacheHandler("/tmp/"))
        response = opener.open("http://google.com")
        print response.headers
        print "Response:", response.read()

        response.recache()
        print response.headers
        print "After recache:", response.read()
    main()

Yahoo Developer Network의 이 글(http://developer.yahoo.com/python/python-caching.html)은 urllib로 수행한 HTTP 호출을 메모리나 디스크에 캐시하는 방법을 설명합니다.

@dbr: HTTPS 응답도 캐싱하려면 다음을 추가해야 할 수도 있습니다.

def https_response(self, request, response):
    """Process an HTTPS response exactly like a plain HTTP one.

    Delegates to ``self.http_response`` and returns whatever it returns.
    """
    handled = self.http_response(request, response)
    return handled

라이선스: CC-BY-SA (저작자 표시 필요)

StackOverflow와 제휴 관계가 없습니다