Code suggestions, following the answer published by @agstudy:
- connect to signals following example from http://doc.scrapy.org/en/latest/topics/extensions.html#sample-extension
- you already save the settings in `__init__`, so you might as well use them afterwards instead of using `from scrapy.conf import settings`
- added a `_port` option
- (cosmetic) changed `self.cb` to `self.couchbase` so that it is not confused with "callback"
See below:
from datetime import datetime

from couchbase import Couchbase
from couchbase.exceptions import CouchbaseError
from scrapy import log
from scrapy import signals
class CouchbaseStore(object):
    """Scrapy item pipeline that stores every scraped item in a Couchbase bucket.

    The connection is opened when the ``spider_opened`` signal fires and
    released when ``spider_closed`` fires. Connection parameters are read
    from the ``COUCHBASE_SERVER``, ``COUCHBASE_PORT``, ``COUCHBASE_BUCKET``
    and ``COUCHBASE_PASSWORD`` settings.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from ``crawler.settings`` and connect its signals.

        Returns the new pipeline instance.
        """
        o = cls(crawler.settings)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        # Wire the close handler as well, so the connection opened in
        # spider_opened() is released when the spider finishes.
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def __init__(self, settings):
        """Remember the connection parameters found in ``settings``.

        ``settings`` only needs a dict-like ``get(name, default)`` method.
        No connection is made here; see ``spider_opened``.
        """
        self._server = settings.get('COUCHBASE_SERVER')
        self._port = settings.get('COUCHBASE_PORT', 8091)
        self._bucket = settings.get('COUCHBASE_BUCKET')
        self._password = settings.get('COUCHBASE_PASSWORD')
        # Stays None until spider_opened() connects successfully, so that
        # spider_closed() can always be called safely.
        self.couchbase = None

    def process_item(self, item, spider):
        """Store ``item`` in the bucket and return it unchanged.

        ``datetime`` values are converted to ISO-8601 strings before being
        stored; every other field is stored as is.
        """
        data = {}
        for field in item.keys():
            value = item[field]
            if isinstance(value, datetime):
                data[field] = value.isoformat()
            else:
                data[field] = value
        ## I assume item have a unique time field
        key = "{0}".format(item['time'].isoformat())
        self.couchbase.set(key, data)
        log.msg("Item with key % s stored in bucket %s/ node %s" %
                (key, self._bucket, self._server),
                level=log.INFO, spider=spider)
        return item

    def spider_opened(self, spider):
        """Open the Couchbase connection; log an error if it cannot be opened."""
        try:
            self.couchbase = Couchbase.connect(bucket=self._bucket,
                                               host=self._server,
                                               port=self._port,
                                               password=self._password)
        except CouchbaseError:
            # Leave a well-defined "not connected" state behind.
            self.couchbase = None
            log.msg('Connection problem to bucket %s' % self._bucket,
                    level=log.ERROR)
        log.msg("CouchbaseStore.spider_opened called", level=log.DEBUG)

    def spider_closed(self, spider):
        """Close the Couchbase connection if one was opened."""
        if self.couchbase is not None:
            self.couchbase._close()
            self.couchbase = None
        log.msg("CouchbaseStore.spider_closed called", level=log.DEBUG)