Pergunta

I would like to download dynamically generated images from a website. The website uses JavaScript, with buttons that switch to the previous and the next image. I inspected the HTTP requests and responses in Chrome: the requests are almost identical except for the image name, which increases numerically (e.g. 000001.jpg, 000002.jpg). I can already access the first image and save it to disk by subclassing QWebView and installing a customized QNetworkAccessManager in which I override the createRequest function:

import sys,urllib,time,os
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import *
from PIL import Image

class NetworkAccessManager(QNetworkAccessManager):
    """Network access manager that intercepts a page's requests to save images.

    Every request issued through this manager passes through
    ``createRequest``.  Requests whose URL matches the image pattern have
    their reply data accumulated and written to disk by ``saveImage``, which
    then emits the ``nextPage()`` signal.  Requests whose URL contains
    ``uf=ssr`` are remembered so that a later request can be derived from
    them.
    """

    def __init__(self, old_manager):
        """Create the manager, reusing the settings of *old_manager*.

        The cache, cookie jar, proxy and proxy factory are taken from
        *old_manager* so requests made through this manager use the same
        session state.
        """
        QNetworkAccessManager.__init__(self)
        self.old_manager = old_manager
        self.setCache(old_manager.cache())
        self.setCookieJar(old_manager.cookieJar())
        self.setProxy(old_manager.proxy())
        self.setProxyFactory(old_manager.proxyFactory())
        self.imreply = None       # reply of the most recent image request
        self.reqstr = None        # URL text of the most recent "uf=ssr" request
        self.otherreply = None    # reply of the most recent "uf=ssr" request
        self.current_req = None   # private copy of the most recent "uf=ssr" request
        self.cnt = 0              # number of follow-up requests issued so far
        self.jpgcnt = 0           # numeric counter parsed from the last matching URL
        self.jpgName = "test.jpg"
        self.first = True
        self.ba = QByteArray()    # bytes of the image received so far

    def createRequest(self, operation, request, data):
        """Dispatch a request, hooking image and ``uf=ssr`` requests.

        Returns the reply produced by the base-class implementation in every
        case; the two matching branches additionally record state on ``self``.
        """
        req = request.url().toString()
        is_image_request = (
            req.contains(QString("zoom="))
            and req.contains(QString("ss2jpg"))
            and not req.contains(QString("pi=2"))
        )
        if is_image_request:
            strreq = str(req)
            l = strreq.find("jid=")
            r = strreq.find(".jpg&a")
            # The file name starts five characters after the start of "jid="
            # (i.e. one character past the "=" sign's successor) and runs
            # through ".jpg"; the same span without the extension is the
            # numeric counter.
            self.jpgName = strreq[l + 5:r + 4]
            self.jpgcnt = int(strreq[l + 5:r])
            print("%s %s" % (self.jpgName, self.jpgcnt))
            self.imreply = QNetworkAccessManager.createRequest(
                self, operation, request, data)
            # Data may arrive in several chunks; saveImage accumulates them.
            self.connect(self.imreply, SIGNAL("readyRead()"), self.saveImage)
            return self.imreply
        elif req.contains(QString("uf=ssr")):
            strreq = str(req)
            self.reqstr = strreq
            # Keep a private copy: the request object handed to this method
            # belongs to the caller and is not guaranteed to stay valid after
            # the call returns.
            self.current_req = QNetworkRequest(request)
            r = strreq.find("?")
            # The six characters before "?" hold the zero-padded counter.
            self.jpgcnt = int(strreq[r - 6:r])
            # NOTE(review): this reply is stored but nothing is connected to
            # it, so its payload is never read here -- confirm that is intended.
            self.otherreply = QNetworkAccessManager.createRequest(
                self, operation, request, data)
            return self.otherreply
        else:
            return QNetworkAccessManager.createRequest(
                self, operation, request, data)

    def saveImage(self):
        """Accumulate image data and save it once the reply is complete.

        Connected to ``readyRead()`` of the current image reply.  Replies
        whose content type is neither JPEG nor PNG are ignored.  When the
        number of buffered bytes equals the ``Content-Length`` header, the
        image is written to ``self.jpgName``, the buffer is cleared and the
        ``nextPage()`` signal is emitted.
        """
        content_type = self.imreply.header(
            QNetworkRequest.ContentTypeHeader).toString()
        if not (content_type.contains(QString("image/jpeg"))
                or content_type.contains(QString("image/png"))):
            return

        contentLen, _ok = QString(
            self.imreply.rawHeader("Content-Length")).toInt()
        self.ba = self.ba.append(self.imreply.readAll())
        if self.ba.size() != contentLen:
            # Only part of the image has arrived; wait for more readyRead().
            return

        im = QImage.fromData(self.ba)
        im.save(self.jpgName)
        # The file is then re-opened and re-saved through PIL, as in the
        # original code (presumably to re-encode it -- TODO confirm the need).
        im = Image.open(self.jpgName)
        print("saving image %s %s" % (contentLen, self.jpgName))
        im.save(self.jpgName)
        self.ba.clear()
        self.emit(SIGNAL("nextPage()"))

class dxWebView(QWebView):
    """Web view that can request the next numbered page through its manager."""

    def __init__(self):
        QWebView.__init__(self)

    def clickNext(self):
        """Issue a request for the page following the last ``uf=ssr`` request.

        Reads ``reqstr``, ``current_req``, ``jpgcnt`` and ``cnt`` from the
        page's network access manager.  Does nothing once 50 follow-up
        requests have been issued, or when no ``uf=ssr`` request has been
        recorded yet.
        """
        manager = self.page().networkAccessManager()
        if manager.cnt >= 50:
            return
        if manager.reqstr is None or manager.current_req is None:
            # No template request has been captured yet; nothing to derive.
            return

        reqstr = manager.reqstr
        query_pos = reqstr.find("?")
        if query_pos < 6:
            # The six-character counter field before "?" is missing.
            return

        # Replace only the six-digit counter directly before "?" and keep its
        # zero padding.  A plain str.replace() of the unpadded number would
        # rewrite every matching digit anywhere in the URL (e.g. each "1"),
        # corrupting the other query parameters.
        nexturl = "%s%06d%s" % (
            reqstr[:query_pos - 6], manager.jpgcnt + 1, reqstr[query_pos:])
        print("next url %s" % nexturl)

        # Work on a copy so the manager's stored request is left untouched.
        nextreq = QNetworkRequest(manager.current_req)
        nextreq.setUrl(QUrl(nexturl))
        manager.get(nextreq)
        manager.cnt = manager.cnt + 1

def main():
    """Start the Qt application, install the intercepting manager and load the page.

    The view's default network access manager is replaced by a
    NetworkAccessManager built from it, the manager's ``nextPage()`` signal is
    wired to the view's ``clickNext`` slot, and the process exits with the
    event loop's return code.
    """
    application = QApplication(sys.argv)
    QWebSettings.globalSettings().setAttribute(QWebSettings.PluginsEnabled, True)

    browser = dxWebView()
    page = browser.page()
    capturing_manager = NetworkAccessManager(page.networkAccessManager())
    page.setNetworkAccessManager(capturing_manager)

    # Each saved image triggers the request for the next page.
    QObject.connect(capturing_manager, SIGNAL("nextPage()"), browser.clickNext)

    start_url = "http://www.yishuleia.cn/DrsPath.do?kid=686A67696A6F6A673134343438303337&username=gdnz2&spagenum=201&pages=50&fid=14813857&a=3fc3e380601ced0f08749c964294120e&btime=2013-04-03&etime=2013-04-23&template=bookdsr1&firstdrs=http%3A%2F%2Fbook.duxiu.com%2FbookDetail.jsp%3FdxNumber%3D000008299393%26d%3D592DC22226A893A958A6578E7D039A43"
    browser.load(QUrl(start_url))
    browser.show()

    exit_code = application.exec_()
    sys.exit(exit_code)

# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

When the first image is saved, clickNext is triggered and the QNetworkAccessManager should send the next request. But I found that manager.get(nextreq) did not work: the HTTP analyzer did not capture any HTTP request or response. Is something wrong in my clickNext function? How should I do this? Thanks!

Foi útil?

Solução

The QNetworkAccessManager is part of the QWebPage object, and its createRequest() method is invoked whenever the rendered HTML (or any JavaScript it contains) requests a resource. As far as I understand, the clickNext() function won't really have access to the actual DOM of the web page in the manner you require.

If your aim is to build an application that can download all of these pictures, you can run some simple JavaScript on the page (for example via view.page().mainFrame().evaluateJavaScript(...)) that automatically clicks through to the 'Next' image. Then, as you have already done, you watch for the image requests in your overridden createRequest() function.

Licenciado em: CC-BY-SA com atribuição
Não afiliado a StackOverflow
scroll top