Question

I have a Python script which scrapes data from a website (www.nowgoal.com). Since this webpage contains JavaScript code, I'm using PyQt4 to render the page, then convert it to HTML and finally parse the required data. All was working fine, but recently they added a JavaScript alert message that prevents the page from being rendered correctly. Looking at the page source, at the bottom there is the JavaScript code that shows the alert message:

`<script type ="text/javascript" >
if(getCookie("enurl_bak")==null)
{
 writeCookie("enurl_bak", "1");
if(confirm('Nowgoal.net is our spare link\n\n Please add to your favorites'))      {try{window.external.addFavorite('http://www.nowgoal.net','LiveScore - NowGoal.com');}catch(e)   {alert('Sorry, fail to add favorits. Your browser can\'t finish this operation. Please use Ctrl+D to    add.');}}
}
</script>`

At this point it seems to be enough to set a cookie (name = "enurl_bak"; value != null) to skip the alert. The problem is that I do not know how to do it: I've looked everywhere, but I was not able to find a real example of how to set cookies using PyQt4.

Here is what I'm using to render the webpage:

from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtNetwork

class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().page().setNetworkAccessManager(networkAccessManager)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

url = 'http://www.nowgoal.com'
r = Render(url)
html = r.frame.toHtml()

I have also tried downloading the page with urllib2, removing the JavaScript alert function, and passing the result to PyQt4's setHtml() instead of calling load(QUrl), but without success.

Was it helpful?

Solution 2

Yes!! Done it :)

from PyQt4.QtNetwork import QNetworkCookie, QNetworkCookieJar
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *
import sys

class Render(QWebPage):
    """Load a page in QtWebKit with the ``enurl_bak`` cookie already set.

    The site only shows its blocking confirm/alert dialog when the
    ``enurl_bak`` cookie is missing, so the cookie is placed in the page's
    cookie jar before the load is started.

    Constructing an instance creates the QApplication, starts the load and
    blocks in the Qt event loop until ``loadFinished`` fires.  Afterwards the
    loaded main frame is available as ``self.frame``.
    """

    def __init__(self, url):
        """Render *url* and return once the page has finished loading.

        Args:
            url: Address of the page to load, as a string.
        """
        # The application object is created before the page is initialised.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)

        # Build the cookie the page's script looks for.  The site's own
        # script writes the value "1", so the same value is used here instead
        # of leaving it empty.  Bytes literals are plain ``str`` on Python 2
        # and are accepted for QByteArray arguments on Python 3.
        self.cookie = QNetworkCookie(b'enurl_bak', b'1')
        # Without a matching domain the cookie is not visible to the site.
        self.cookie.setDomain('.nowgoal.com')

        # Install the jar *before* starting the load so the cookie is already
        # in place for the very first request and for the page's scripts.
        self.cookiejar = QNetworkCookieJar()
        self.cookiejar.setAllCookies([self.cookie])
        self.networkAccessManager().setCookieJar(self.cookiejar)

        self.mainFrame().load(QUrl(url))
        # Blocks until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Keep a reference to the loaded frame and stop the event loop.

        Args:
            result: Value delivered by the ``loadFinished`` signal (unused).
        """
        self.frame = self.mainFrame()
        # print() with a single argument behaves the same on Python 2 and 3;
        # the bare ``print"..."`` statement is a syntax error on Python 3.
        print("loadfinished")
        self.app.quit()

# Render the page; Render() only returns after loading has finished.
url = 'http://www.nowgoal.com'
r = Render(url)
# Keep the instance and extract the rendered markup so it can be parsed
# afterwards (discarding the instance would lose the loaded frame).
html = r.frame.toHtml()

And again, thank you ekhumoro for putting me on the right track!

OTHER TIPS

The test script below successfully sets and reads the cookie, preventing the alert message from being shown. However, this only works with the test.html page: for some unknown reason (webkit bug?), it doesn't work with the www.nowgoal.com website.

from PyQt4 import QtCore, QtGui, QtWebKit, QtNetwork

class WebPage(QtWebKit.QWebPage):
    """Web page whose cookie jar is pre-seeded with ``enurl_bak=1``.

    The cookie is installed before anything is loaded, so a script that
    checks for ``enurl_bak`` finds it and does not show its alert.
    """

    def __init__(self, domain=None):
        """Create the page and install the pre-seeded cookie jar.

        Args:
            domain: Optional cookie domain, e.g. ``'.nowgoal.com'``.  When
                omitted the cookie carries no domain, which was found to be
                enough for a local test page but not for the remote
                www.nowgoal.com site.
        """
        QtWebKit.QWebPage.__init__(self)

        # Bytes literals are plain ``str`` on Python 2 and are accepted for
        # QByteArray arguments on Python 3.
        cookie = QtNetwork.QNetworkCookie(b'enurl_bak', b'1')
        if domain is not None:
            cookie.setDomain(domain)

        # Parent the jar to the page and keep a reference on the instance.
        self.cookies = QtNetwork.QNetworkCookieJar(self)
        self.cookies.setAllCookies([cookie])
        self.networkAccessManager().setCookieJar(self.cookies)
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, url):
        """Begin loading *url* (a string) in the page's main frame."""
        self.mainFrame().load(QtCore.QUrl(url))

    def handleLoadFinished(self):
        """Report that loading finished and quit the application."""
        print('handleLoadFinished')
        QtGui.qApp.quit()

if __name__ == '__main__':
    # Script entry point: load the local test page and run the Qt event loop
    # until the page's load handler quits the application.
    import sys

    application = QtGui.QApplication(sys.argv)
    page = WebPage()
    page.start('test.html')
    exit_code = application.exec_()
    sys.exit(exit_code)

test.html:

<script type="text/javascript">
// from www.nowgoal.com (public.js)
// Return the value stored in document.cookie for the cookie called `name`,
// or null when document.cookie is empty or does not contain "name=".
// Kept exactly as copied so the test page behaves like the original script.
function getCookie(name){
    var cname = name + "=";
    var dc = document.cookie;
    if (dc.length > 0){
        // NOTE(review): `begin` and `end` are assigned without `var`, so in
        // non-strict code they become global variables.
        // NOTE(review): indexOf finds the first "name=" anywhere in the
        // string, so a cookie whose name merely ends with `name` also matches.
        begin = dc.indexOf(cname);
        if (begin != -1){
            // Skip past "name=" to the first character of the value.
            begin += cname.length;
            // The value runs up to the next ";" or to the end of the string.
            end = dc.indexOf(";", begin);
            if (end == -1) end = dc.length;
            return dc.substring(begin, end);
        }
    }
    return null;
}
// Show an alert only when the "enurl_bak" cookie cannot be found; if the
// cookie was pre-set by the host application, no alert appears.
if (getCookie('enurl_bak') == null) {
    alert('"enurl_bak" value is null');
}
</script>

UPDATE:

It seems there is no webkit bug: I just needed to set the domain, as per the answer by SkY3d.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top