Question

Here is the script I am running:

// Load CasperJS and create the scraper instance
var casper = require('casper').create();

// Courserank URLs visited by this script
var base = 'https://www.courserank.com';
var home = base + '/w/home';
var schools = base + '/w/schools?switchSchool=1';

// Values typed into the login form, keyed by input name
var credentials = {
    username: 'hatboysam@gmail.com',
    password: '****'
};

// First step callback: runs once the homepage has been opened.
// `this` is the casper instance; the final `true` asks fill() to submit the form.
function login() {
    console.log('Logging in...');
    this.fill('form[action="login"]', credentials, true);
}

// Open the homepage, then log in
casper.start(home, login);

// Returns whatever document.querySelectorAll yields for 'div.link'
// (the collection of matching elements in the current document).
function getSchools() {
    return document.querySelectorAll('div.link');
}

//Go to the schools page
// Step added after the start step: logs the current URL, then opens the
// school-selection page and logs details of the elements found there.
casper.then(function() {
    console.log(this.getCurrentUrl());
    //Open the school choice page
    casper.open(schools).then(function() {
        console.log(this.getCurrentUrl());
        //Get all school links
        // getSchools is handed to evaluate(), so it executes in the page
        // context; schools_arr is whatever evaluate() hands back from it.
        var schools_arr = this.evaluate(getSchools);
        console.log(schools_arr.length);
        // map is used here only for its logging side effect; the array it
        // returns is discarded.
        Array.prototype.map.call(schools_arr, function(elem) {
            console.log(elem.innerHTML);
        });
    });
});

// Start executing the steps defined above
casper.run();

Everything goes fine until the inner loop of the map call, specifically console.log(elem.innerHTML). Many of the elements in schools_arr are null. If I add an if(elem != null) { ... } around the console.log statement it's all fine but that defeats the point. When I run the same document.querySelectorAll in the Chrome console on the page none of the 513 elements in the NodeList are null. CasperJS also reports 513 elements, but it shows many as null. What's going on here? Is the page not loading fully? I have never used CasperJS before so sorry if this is a newbie mistake.

Was it helpful?

Solution

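You can't return native DOM nodes from the page context using evaluate(): the return value is serialized as JSON on its way back to the CasperJS context, so anything that doesn't survive that round trip arrives as null. You have to Array#map the nodes to something JSON-serializable (strings, numbers, plain objects) inside the page.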

So your getSchools() function should be doing something like:

// Collects the innerHTML of every 'div.link' element into a plain array of
// strings, so the result can be returned from evaluate().
function getSchools() {
    var nodes = document.querySelectorAll('div.link');
    var markup = [];
    for (var i = 0; i < nodes.length; i++) {
        markup.push(nodes[i].innerHTML);
    }
    return markup;
}

That said, I'm not sure how useful the raw innerHTML strings will be, so it's usually better to map the elements to exactly the properties you need:

// Returns the href attribute of every anchor inside a 'div.link' element,
// as a plain array of strings.
function getSchools() {
    var anchors = Array.prototype.slice.call(document.querySelectorAll('div.link a'));

    function hrefOf(anchor) {
        return anchor.getAttribute('href');
    }

    return anchors.map(hrefOf);
}

Edit: as requested in the comments, to fetch inner text for all links:

// Returns the text content of every anchor inside a 'div.link' element,
// as a plain array of strings.
function getSchools() {
    var labels = [];
    Array.prototype.forEach.call(document.querySelectorAll('div.link a'), function(link) {
        labels.push(link.textContent);
    });
    return labels;
}
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top