First off: I'm new to Node, and a relative beginner at programming.
I'm trying to create a small web app with Express, whose only goal is to fetch and reformat data from a website that doesn't have an open API.
To do so, I've decided to learn about scraping, and that brought me to Cheerio and Request.
I'm using reddit as an example, to learn on. The end goal in this example is to gather the name and href of the posts on the front page as well as the url leading to the comments, then to go on that page to scrape the number of comments.
What follows is the route that is called on a GET request to / (please excuse the variable names and the comments/console.logs; I got frustrated):
/*
* GET home page.
*/
exports.index = function(req, res){
var request = require('request')
, cheerio =require('cheerio')
, mainArr = []
, test = "test"
, uI
, commentURL;
function first() {
request("http://www.reddit.com", function(err, resp, body) {
if (!err && resp.statusCode == 200) {
var $ = cheerio.load(body);
$('.thing', '#siteTable').each(function(){
var url = $('a.title', this).attr('href')
, title = $('a.title', this).html()
, commentsLink = $('a.comments', this).attr('href')
, arr = [];
arr.push(title);
arr.push(url);
arr.push(commentsLink);
mainArr.push(arr);
});
second();
};
});
}
function second() {
for (i = mainArr.length - 1; i >= 0; i--) {
uI = mainArr[i].length - 1;
commentURL = mainArr[i][uI];
console.log(commentURL + ", " + uI + ", " + i);
var foo = commentURL;
request(foo, function(err, resp, body) {
console.log("what the shit");
// var $ = cheerio.load(body);
// console.log(mainArr.length + ", " + commentURL + ", " + i + ", " + uI);
// var test = $('span.title', 'div.content').html();
console.log(test + ", "+ foo + ", " + commentURL + ", " + i + ", " + uI);
// mainArr[1][2] = test;
});
};
if (i<=0) {
res.render('index', {title: test});
};
}
first();
};
The function first(); works as intended. It puts the title, the href and url to the comments in an array, then pushes that array in a master array containing those data points for all of the posts on the front page. It then calls the function second();
Said function's goal is to loop through the master array (mainArr[]), then select all of the urls leading to comments (mainArr[i][uI]) and launch a request() with that url as first parameter.
The loop works, but during the second call of request() inside the second() function, everything breaks down. Inside the callback, the variable i is permanently set to -1, and commentURL (the variable that should hold the comments URL of the current post) is permanently set to the first URL in mainArr[]. There is also weird behavior with mainArr.length: depending on where I place it, I'm told that mainArr is undefined.
I have a feeling that I'm missing something obvious (probably to do with asynchronicity), but for the life of me, I can't find it.
I would be really grateful for any suggestions!