Question

I have a small issue: I need Scrapy to loop a predetermined number of times. The reason is that I am submitting a POST request and scraping the results. However, the results are not all on a single page, so I need to POST again with "cpipage" incremented each time. cpipage is the page number. Here is my spider code; I have changed the URL to nourl.com since the website I am scraping is not mine.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import FormRequest, Request
#from etmd.items import Etmditems
import scrapy

class EtmdSpider(Spider):
    """Spider that POSTs a search form to dp.asp and dumps every <td> of the reply.

    Only a single results page is requested: the "cpipage" field of the
    payload is fixed at "4".
    """

    name = "etmd"
    start_urls = ["http://b2.nourl.com/dp.asp"]

    def parse(self, response):
        """Submit the search form and hand the resulting page to parse_data.

        The incoming ``response`` (the GET of the start URL) is not inspected.
        """
        url = "http://b2.nourl.com/dp.asp"
        payload = {
            "AppKey": "94921000e1999f84a518725",
            "ComparisonType1_1": "LIKE",
            "Value1_1": "",
            "MatchNull1_1": "N",
            "ComparisonType2_1": "LIKE",
            "MatchNull2_1": "N",
            "Value2_1": "",
            "ComparisonType3_1": "=",
            "MatchNull3_1": "N",
            "Value3_1": "",
            "x": "69",
            "y": "27",
            "FieldName1": "County",
            "Operator1": "OR",
            "NumCriteriaDetails1": "1",
            "Operator2": "OR",
            "NumCriteriaDetails2": "1",
            "FieldName3": "Year",
            "Operator3": "OR",
            "NumCriteriaDetails3": "1",
            "PageID": "2",
            "GlobalOperator": "AND",
            "NumCriteria": "3",
            "Search": "1",
            "cpipage": "4",
        }
        return FormRequest(url, formdata=payload, callback=self.parse_data)

    def parse_data(self, response):
        """Extract all <td> nodes, append them to exported.txt and print them."""
        items = []
        sel = Selector(response)
        items.append(sel.xpath('//td').extract())

        # `with` closes the file even if write() raises; previously the
        # handle was opened and never closed.
        with open("exported.txt", "a") as exportfile:
            exportfile.write(str(items))

        # Parenthesised single-argument form prints the same text under
        # Python 2 and is also valid Python 3 syntax.
        print(items)

So within the payload dictionary I have cpipage, which in this case is "4", but I need it to increment all the way to 175. Is there any way to do this within the code that I currently have, or by running the Scrapy spider through a script and not the shell?

I have already tried just a for loop:

# Attempted loop over result pages. The use of `self` and `return` implies this
# fragment sits inside a spider method (presumably parse() -- the enclosing def
# is not shown).
# NOTE: the `return` at the end of the loop body executes on the first
# iteration (i == 0), so no later value of i is ever reached.
for i in range(175):
    url = "http://b2.nourl.com/dp.asp"
    # "cpipage" is filled in from the loop index i; all other fields are constant.
    payload = {"AppKey": "94921000e1999f84a518725", "ComparisonType1_1": "LIKE", "Value1_1": "", "MatchNull1_1" : "N", "ComparisonType2_1" : "LIKE", "MatchNull2_1" : "N", "Value2_1" : "", "ComparisonType3_1": "=", "MatchNull3_1" : "N", "Value3_1" : "", "x":"69", "y":"27", "FieldName1" : "County", "Operator1": "OR", "NumCriteriaDetails1": "1", "Operator2" : "OR", "NumCriteriaDetails2" : "1", "FieldName3": "Year", "Operator3" : "OR", "NumCriteriaDetails3": "1", "PageID" : "2", "GlobalOperator": "AND", "NumCriteria" : "3", "Search" : "1", "cpipage": "%i" %i}
    return (FormRequest(url, formdata = payload, callback = self.parse_data))
Was it helpful?

Solution

A return statement exits the method immediately, so your loop never gets past its first iteration and only one request is ever created.

You should either return a list of all the requests:

def parse(self, response):
    """Return one FormRequest per results page, all handled by self.parse_data.

    The incoming ``response`` is not inspected; it only triggers the batch
    of POST requests.
    """
    url = "http://b2.nourl.com/dp.asp"
    # Fields that are identical for every page; built once instead of per iteration.
    base_payload = {
        "AppKey": "94921000e1999f84a518725",
        "ComparisonType1_1": "LIKE",
        "Value1_1": "",
        "MatchNull1_1": "N",
        "ComparisonType2_1": "LIKE",
        "MatchNull2_1": "N",
        "Value2_1": "",
        "ComparisonType3_1": "=",
        "MatchNull3_1": "N",
        "Value3_1": "",
        "x": "69",
        "y": "27",
        "FieldName1": "County",
        "Operator1": "OR",
        "NumCriteriaDetails1": "1",
        "Operator2": "OR",
        "NumCriteriaDetails2": "1",
        "FieldName3": "Year",
        "Operator3": "OR",
        "NumCriteriaDetails3": "1",
        "PageID": "2",
        "GlobalOperator": "AND",
        "NumCriteria": "3",
        "Search": "1",
    }
    last_page = 175
    requests = []
    # range(175) would produce pages 0..174 and never request page 175.
    # Assumes page numbering starts at 1 -- TODO confirm against the site.
    for page in range(1, last_page + 1):
        # Each request gets its own dict so the shared base is never mutated.
        payload = dict(base_payload, cpipage=str(page))
        requests.append(FormRequest(url, formdata=payload, callback=self.parse_data))
    return requests

or yield them one by one:

def parse(self, response):
    """Yield one FormRequest per results page, all handled by self.parse_data.

    The incoming ``response`` is not inspected; it only triggers the series
    of POST requests.
    """
    url = "http://b2.nourl.com/dp.asp"
    # Fields that are identical for every page; built once instead of per iteration.
    base_payload = {
        "AppKey": "94921000e1999f84a518725",
        "ComparisonType1_1": "LIKE",
        "Value1_1": "",
        "MatchNull1_1": "N",
        "ComparisonType2_1": "LIKE",
        "MatchNull2_1": "N",
        "Value2_1": "",
        "ComparisonType3_1": "=",
        "MatchNull3_1": "N",
        "Value3_1": "",
        "x": "69",
        "y": "27",
        "FieldName1": "County",
        "Operator1": "OR",
        "NumCriteriaDetails1": "1",
        "Operator2": "OR",
        "NumCriteriaDetails2": "1",
        "FieldName3": "Year",
        "Operator3": "OR",
        "NumCriteriaDetails3": "1",
        "PageID": "2",
        "GlobalOperator": "AND",
        "NumCriteria": "3",
        "Search": "1",
    }
    last_page = 175
    # range(175) would produce pages 0..174 and never request page 175.
    # Assumes page numbering starts at 1 -- TODO confirm against the site.
    for page in range(1, last_page + 1):
        # Each request gets its own dict so the shared base is never mutated.
        payload = dict(base_payload, cpipage=str(page))
        yield FormRequest(url, formdata=payload, callback=self.parse_data)
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top