Scrapy spider not including all requested pages

Question 1

The final answer did indeed lie in the indentation of one single yield line. This is the code that ended up doing what I needed it to do.

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem

RESTAURANTS = ['sixteen-chicago']

def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    sel = Selector(response)
    totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
    return pages

class YelpXSpider(Spider):
    name = "yelpx"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        requests = []

        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        items = []

        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item

        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)
            for request in requests:
                    yield request

Thanks to everyone for helping out a noob!

Question 2

You need to reorganize the methods a bit. First parse restaurant page in parse() method. Then, return requests for reviews and handle responses in another method, e.g. parse_review():

import re

from scrapy.item import Item, Field
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from yelp2.items import YelpReviewItem


RESTAURANTS = ['sixteen-chicago']

class Yelp2aSpider(Spider):
    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        reviewsPerPage = 40
        sel = Selector(response)
        totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
        pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse_review) for n in range(totalReviews/reviewsPerPage)]
        return pages

    def parse_review(self, response):
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item

Question 3

If you're returning items/requests in more than one place, you should replace your return statements with yield statements, which turn your function into a generator, which returns a new element each time it's generated (yields it), without exiting the function until they are all returned. Otherwise, as your code is now, your function will exit after the first return and won't get to sending the requests for the following pages.

Edit: Correction - you should yield one item/request at a time, so:

Replace

for review in reviews:
    item = ...
return items

with

for review in reviews:
    item = ...
    yield item

and replace

return requests

with

for request in requests:
    yield request