Question

I'm trying to scrape some PubMed data using cheerio. The following script works fine when every record has all the fields, but when some XML tag is missing from a record, the output comes out in the wrong order.

var request = require('request'),
cheerio = require('cheerio');
// Fetch two PubMed records as XML and print PMID, year, title and abstract for each.
request('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=23545583,23103438', 
  function(error, response, body) {
    // NOTE(review): `error` is never checked before `body` is parsed.
    var $ = cheerio.load(body);
    // One iteration per PubmedArticle element found in the document.
    for (var i = 0; i < $('PubmedArticle').length; i++) {
        // Each selector below is evaluated against the WHOLE document, so .eq(i)
        // picks the i-th match overall, not the match belonging to the i-th article.
        // When an article lacks a field (e.g. no AbstractText), every later match
        // moves up one index and is printed under the wrong article.
        console.log($('PubmedArticle PMID').slice(0).eq(i).text());
        console.log($('PubmedArticle DateCreated Year').slice(0).eq(i).text());
        console.log($('PubmedArticle ArticleTitle').slice(0).eq(i).text());
        console.log($('PubmedArticle Abstract AbstractText').slice(0).eq(i).text());
      };
});

In this example, the abstract is printed below the first title instead of the second, because the first article does not contain an abstract.

Was it helpful?

Solution

In the end, I was able to overcome this with a different strategy: iterate over each PubmedArticle and look up its fields inside that article only.

var $ = require('cheerio')
var request = require('request')

/**
 * Callback for `request`: prints the PMID, title, creation year and (when
 * present) the abstract of every PubmedArticle in an efetch response.
 *
 * Every field is looked up inside its own article, so an article that lacks
 * an abstract cannot cause another article's abstract to be printed in its place.
 *
 * @param {?Error} err  Request error; when set it is logged and nothing is parsed.
 * @param {object} resp Response object (unused).
 * @param {string} xml  Response body to parse.
 * @returns {void}
 */
function gotXML(err, resp, xml) {
  if (err) {
    console.error(err);
    return;
  }

  var parsedXML = $.load(xml);

  // Text of the first element matching `selector` inside `scope`, or '' when
  // there is no match. `.text()` also gathers text from nested inline markup,
  // where reading `[0].children[0].data` would throw or return only a fragment.
  function firstText(scope, selector) {
    return scope.find(selector).first().text();
  }

  // `.each` rather than `.map`: the callback is run only for its side effects.
  parsedXML('PubmedArticle').each(function (i, article) {
    // Wrap the node with the instance returned by `load` instead of the bare
    // cheerio export, so lookups stay tied to the parsed document.
    var entry = parsedXML(article);

    // Selectors are lowercase as in the original answer — presumably because
    // the default (non-XML) parser lowercases tag names; confirm if xmlMode is enabled.
    console.log(firstText(entry, 'pmid'));
    console.log(firstText(entry, 'articletitle'));
    console.log(firstText(entry, 'datecreated year'));

    // The abstract is optional: print nothing at all when the article has none.
    var abstractNode = entry.find('abstracttext').first();
    if (abstractNode.length > 0) {
      console.log(abstractNode.text());
    }
  });
}

// Despite its name, `domain` holds the full efetch URL: PubMed records
// 23545583 and 23103438, returned as XML.
var domain = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=23545583,23103438';
// Issue the request; gotXML is invoked with (err, resp, xml) when it completes.
request(domain, gotXML);
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top