Only the first matching rule will be followed by CrawlSpider: if multiple rules match the same link, the first one will be used, according to the order they're defined in this attribute.
You have a couple of options:

Option 1 — define a single generator callback that uses `yield`:
# One catch-all rule: follow every link and route each response to parse_items.
# NOTE(review): SgmlLinkExtractor is deprecated in modern Scrapy in favor of
# LinkExtractor — confirm against the project's Scrapy version.
rules = (
Rule(SgmlLinkExtractor(allow=r'.*'), callback='parse_items', follow=True),
)
def parse_items(self, response):
    """Generator callback: extract and yield two items from one response.

    The first item carries the website and title fields; the second
    carries the website and email fields.
    """
    selector = Selector(response)

    # First item: website + title.
    first = IdeeItem()
    first['website'] = selector.xpath("""/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[4]/td/h4/text()""").extract()
    first['title'] = selector.xpath("""/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[2]/td/h4/text()""").extract()
    yield first

    # Second item: website + email.
    second = IdeeItem()
    second['website'] = selector.xpath("""/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[6]/td/h4/text()""").extract()
    second['email'] = selector.xpath("""//html/body/table/tr[1]/td[2]/div[4]/div/table/tr[8]/td/a[1]/text()""").extract()
    yield second
Option 2 — or call the two callbacks one after the other from a single entry point:
# Same catch-all rule as in option 1: every followed link is handed to
# parse_items, which then delegates to the two extractor helpers.
rules = (
Rule(SgmlLinkExtractor(allow=r'.*'), callback='parse_items', follow=True),
)
def parse_items(self, response, sel=None):
    """Callback that chains both item extractors for a single response.

    Yields the item built by parse_item1 followed by the item built
    by parse_item2.
    """
    # Reuse a caller-supplied Selector if one was passed; the original
    # unconditionally rebuilt it, making the sel parameter dead.
    if sel is None:
        sel = Selector(response)
    # BUG FIX: parse_item1/parse_item2 each RETURN a single item, so the
    # original `for r in self.parse_item1(...)` iterated over the item's
    # field keys instead of yielding the item. Yield the results directly.
    yield self.parse_item1(response, sel)
    yield self.parse_item2(response, sel)
def parse_item1(self, response, sel=None):
    """Build and return one item holding the website and title fields."""
    # Build a Selector only when the caller did not hand one in.
    selector = sel if sel is not None else Selector(response)
    item = IdeeItem()
    item['website'] = selector.xpath("""/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[4]/td/h4/text()""").extract()
    item['title'] = selector.xpath("""/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[2]/td/h4/text()""").extract()
    return item
def parse_item2(self, response, sel=None):
    """Build and return one item holding the website and email fields."""
    # Build a Selector only when the caller did not hand one in.
    selector = sel if sel is not None else Selector(response)
    item = IdeeItem()
    item['website'] = selector.xpath("""/html/body/table/tr[1]/td[2]/div[4]/div/table/tr[6]/td/h4/text()""").extract()
    item['email'] = selector.xpath("""//html/body/table/tr[1]/td[2]/div[4]/div/table/tr[8]/td/a[1]/text()""").extract()
    return item