문제

I have a Python script that scrapes data from a website (www.nowgoal.com). Since the page contains JavaScript code, I'm using PyQt4 to render the page, convert it to HTML, and finally parse the required data. Everything was working fine, but recently they added a JavaScript alert message that prevents the page from being rendered correctly. Looking at the page source, the JavaScript for the alert message is at the bottom:

`<script type ="text/javascript" >
// Runs only when no "enurl_bak" cookie is found (getCookie returns null).
if(getCookie("enurl_bak")==null)
{
 // Record the cookie so that this branch is skipped once the cookie is present.
 writeCookie("enurl_bak", "1");
// Ask the visitor to bookmark the spare link; if confirmed, try
// window.external.addFavorite and show an alert if that call throws.
if(confirm('Nowgoal.net is our spare link\n\n Please add to your favorites'))      {try{window.external.addFavorite('http://www.nowgoal.net','LiveScore - NowGoal.com');}catch(e)   {alert('Sorry, fail to add favorits. Your browser can\'t finish this operation. Please use Ctrl+D to    add.');}}
}
</script>`

At this point it seems to be enough to set a cookie (name = "enurl_bak", value != null) to skip the alert. The problem is that I do not know how to do it: I've looked everywhere, but I was not able to find a real example of how to set cookies using PyQt4.

Here is what I'm using to render the webpage:

from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtNetwork

class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().page().setNetworkAccessManager(networkAccessManager)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

url = 'http://www.nowgoal.com'
r = Render(url)
html = r.frame.toHtml()

I have also tried fetching the page with urllib2, removing the JavaScript alert function, and passing the result to setHtml instead of using PyQt4's load(QUrl) method, but without success.

도움이 되었습니까?

해결책 2

Yes!! Done it :)

from PyQt4.QtNetwork import QNetworkCookie, QNetworkCookieJar
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *
import sys

class Render(QWebPage):
    """Load ``url`` in a QtWebKit page and block until loading has finished.

    A cookie jar holding an ``enurl_bak`` cookie for ``.nowgoal.com`` is
    installed on the page's network access manager *before* the load is
    started, so the cookie is already in place when the page's scripts look
    for it.

    After construction returns, ``self.frame`` holds the page's main frame.
    """

    def __init__(self, url):
        """Create the application, seed the cookie jar and load ``url``.

        This call runs the Qt event loop and only returns once the
        ``loadFinished`` signal has fired.
        """
        # The application object is created before the page is initialised.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)

        # The site's own script stores "1" under this name; name and value
        # are byte strings because QNetworkCookie takes QByteArray arguments.
        self.cookie = QNetworkCookie(b'enurl_bak', b'1')
        self.cookie.setDomain('.nowgoal.com')
        self.cookiejar = QNetworkCookieJar()
        self.cookiejar.setAllCookies([self.cookie])
        self.networkAccessManager().setCookieJar(self.cookiejar)

        # Connect the signal and start loading only after the jar is set.
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        """Keep a reference to the main frame and stop the event loop."""
        self.frame = self.mainFrame()
        print("loadfinished")
        self.app.quit()

if __name__ == '__main__':
    # Start rendering only when executed as a script, so that importing this
    # module does not launch a Qt event loop as a side effect.
    url = 'http://www.nowgoal.com'
    Render(url)

And again, thank you ekhumoro for pointing me in the right direction!

다른 팁

The test script below successfully sets and reads the cookie, preventing the alert message from being shown. However, this only works with the test.html page: for some unknown reason (webkit bug?), it doesn't work with the www.nowgoal.com website.

from PyQt4 import QtCore, QtGui, QtWebKit, QtNetwork

class WebPage(QtWebKit.QWebPage):
    """QWebPage whose cookie jar is pre-seeded with an ``enurl_bak`` cookie.

    The application quits as soon as the main frame reports that loading
    has finished.
    """

    def __init__(self, domain=None):
        """Install the cookie jar and connect the load-finished handler.

        Args:
            domain: Optional cookie domain (for example ``'.nowgoal.com'``).
                When omitted, the cookie is created without a domain, which
                is the original behaviour.
        """
        QtWebKit.QWebPage.__init__(self)
        # QNetworkCookie takes QByteArray arguments, hence the byte literals.
        cookie = QtNetwork.QNetworkCookie(b'enurl_bak', b'1')
        if domain:
            cookie.setDomain(domain)
        self.cookies = QtNetwork.QNetworkCookieJar(self)
        self.cookies.setAllCookies([cookie])
        self.networkAccessManager().setCookieJar(self.cookies)
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, url):
        """Begin loading ``url`` in the main frame."""
        self.mainFrame().load(QtCore.QUrl(url))

    def handleLoadFinished(self):
        """Report that loading finished and quit the application."""
        print('handleLoadFinished')
        QtGui.qApp.quit()

if __name__ == '__main__':

    import sys

    # Build the application, load the local test page and hand the event
    # loop's return code back to the shell.
    app = QtGui.QApplication(sys.argv)
    page = WebPage()
    page.start('test.html')
    exit_code = app.exec_()
    sys.exit(exit_code)

test.html:

<script type="text/javascript">
// from www.nowgoal.com (public.js)
/**
 * Return the value stored for the cookie `name`, or null when it is absent.
 *
 * Reads document.cookie and returns the text between "name=" and the next
 * ";" (or the end of the string when no ";" follows).
 *
 * NOTE: the lookup is a plain substring search, so a cookie whose name merely
 * ends with `name` is matched as well. The matching is deliberately left
 * unchanged so this test page behaves like the function it was copied from.
 */
function getCookie(name){
    var cname = name + "=";
    var dc = document.cookie;
    // Declared with var so they stay local instead of leaking as implicit globals.
    var begin, end;
    if (dc.length > 0){
        begin = dc.indexOf(cname);
        if (begin != -1){
            begin += cname.length;
            end = dc.indexOf(";", begin);
            if (end == -1) end = dc.length;
            return dc.substring(begin, end);
        }
    }
    return null;
}
// Show an alert when getCookie finds no "enurl_bak" cookie for this page.
if (getCookie('enurl_bak') == null) {
    alert('"enurl_bak" value is null');
}
</script>

UPDATE:

It seems there is no webkit bug: I just needed to set the domain, as per the answer by SkY3d.

라이센스 : CC-BY-SA ~와 함께 속성
제휴하지 않습니다 StackOverflow
scroll top