How can I take a screenshot/image of a website using Python?

https://stackoverflow.com/questions/1197172

20-09-2019
|

Question

What I want to achieve is to take a screenshot of any website using Python.

Env: Linux

Solution

On the Mac, there's webkit2png and on Linux+KDE, you can use khtml2png. I've tried the former and it works quite well, and heard of the latter being put to use.

I recently came across QtWebKit which claims to be cross platform (Qt rolled WebKit into their library, I guess). But I've never tried it, so I can't tell you much more.

The QtWebKit link shows how to access it from Python. For the other tools, you should at least be able to use subprocess to do the same.

OTHER TIPS

Here is a simple solution using webkit: http://webscraping.com/blog/Webpage-screenshots-with-webkit/

import sys
import time
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Screenshot(QWebView):
    """Render web pages with QtWebKit and save them as image files.

    The Qt event loop is pumped by hand while a page loads, so callers do
    not need to run ``app.exec_()`` themselves.
    """

    def __init__(self):
        # Qt permits a single QApplication per process and needs it to exist
        # before any widget is constructed, so reuse one if already present.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebView.__init__(self)
        self._loaded = False   # set once loadFinished has fired
        self._load_ok = False  # success flag reported by loadFinished
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        """Load ``url``, render the whole page and save it to ``output_file``.

        Returns True when the page reported a successful load and the image
        was written, False otherwise. The image is still rendered and saved
        when the load failed.
        """
        self.load(QUrl(url))
        load_ok = self.wait_load()
        # Grow the viewport to the full document so nothing is clipped.
        frame = self.page().mainFrame()
        self.page().setViewportSize(frame.contentsSize())
        # A freshly sized QImage holds uninitialised pixel data; give it a
        # defined white background before painting the page onto it.
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        image.fill(qRgb(255, 255, 255))
        painter = QPainter(image)
        frame.render(painter)
        painter.end()
        # Single-argument call form prints identically on Python 2 and 3.
        print('saving %s' % output_file)
        saved = image.save(output_file)
        return bool(load_ok and saved)

    def wait_load(self, delay=0):
        """Process application events until the current page load finishes.

        ``delay`` is the number of seconds to sleep between event-loop
        passes. Returns the success flag delivered by ``loadFinished``.
        """
        while not self._loaded:
            self.app.processEvents()
            time.sleep(delay)
        # Re-arm the flag so the next capture() waits for its own load.
        self._loaded = False
        return self._load_ok

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: remember completion and its outcome."""
        self._load_ok = bool(result)
        self._loaded = True

# Capture the sample pages, reusing one Screenshot instance for all of them.
s = Screenshot()
for page_url, target in (
        ('http://webscraping.com', 'website.png'),
        ('http://webscraping.com/blog', 'blog.png')):
    s.capture(page_url, target)

Here is my solution, put together with help from various sources. It captures the full web page, optionally crops it, and can also generate a thumbnail from the cropped image. The requirements are as follows:

Requirements:

Install NodeJS
Using Node's package manager install phantomjs: npm -g install phantomjs
Install selenium (in your virtualenv, if you are using that)
Install imageMagick
Add phantomjs to system path (on windows)

import os
from subprocess import Popen, PIPE
from selenium import webdriver

def abspath(*p):
    """Join the given path components and return the absolute path."""
    return os.path.abspath(os.path.join(*p))


# Absolute path of the directory that contains this script.
ROOT = abspath(os.path.dirname(__file__))


def execute_command(command):
    """Run an external command and raise if it fails or prints to stdout.

    Args:
        command: a shell command line (string), run through the shell, or a
            list/tuple of arguments, run directly without a shell.

    Raises:
        Exception: the command exited with a non-zero status, or it wrote
            anything other than whitespace to standard output (the captured
            output is then the exception argument).
    """
    # Argument sequences bypass the shell, so they need no quoting.
    use_shell = not isinstance(command, (list, tuple))
    process = Popen(command, shell=use_shell, stdout=PIPE)
    # communicate() drains the pipe, closes it and reaps the child process.
    output, _ = process.communicate()
    if process.returncode != 0:
        # Tools that report errors only on stderr are caught by this check.
        raise Exception('Command %r failed with exit status %d: %r'
                        % (command, process.returncode, output))
    if output and not output.isspace():
        raise Exception(output)


def do_screen_capturing(url, screen_path, width, height):
    """Open ``url`` in PhantomJS and save a screenshot to ``screen_path``.

    The browser window is resized to ``width`` x ``height`` only when both
    values are truthy. The driver is always shut down afterwards, including
    when loading the page or saving the image raises.
    """
    print("Capturing screen..")
    # PhantomJS writes its service log file into the current directory.
    # To store the log elsewhere, create the driver like this instead:
    # driver = webdriver.PhantomJS(service_log_path='/var/log/phantomjs/ghostdriver.log')
    driver = webdriver.PhantomJS()
    try:
        driver.set_script_timeout(30)
        if width and height:
            driver.set_window_size(width, height)
        driver.get(url)
        driver.save_screenshot(screen_path)
    finally:
        # Without quit() every call leaves a phantomjs process behind.
        driver.quit()


def do_crop(params):
    """Crop the captured image with ImageMagick's ``convert`` command.

    ``params`` must provide ``screen_path`` (input image), ``width`` and
    ``height`` (size of the crop, taken at offset +0+0) and ``crop_path``
    (output image).
    """
    print("Croping captured image..")
    source = params['screen_path']
    geometry = '%sx%s+0+0' % (params['width'], params['height'])
    target = params['crop_path']
    # The helper expects one shell command line, so join with spaces.
    execute_command(' '.join(('convert', source, '-crop', geometry, target)))


def do_thumbnail(params):
    """Create a thumbnail of the cropped image with ImageMagick's ``convert``.

    ``params`` must provide ``crop_path`` (input image), ``width`` and
    ``height`` (thumbnail size) and ``thumbnail_path`` (output image).
    """
    print("Generating thumbnail from croped captured image..")
    source = params['crop_path']
    size = '%sx%s' % (params['width'], params['height'])
    target = params['thumbnail_path']
    arguments = ('convert', source, '-filter', 'Lanczos',
                 '-thumbnail', size, target)
    # The helper expects one shell command line, so join with spaces.
    execute_command(' '.join(arguments))


def get_screen_shot(**kwargs):
    """Capture a web page and optionally derive a crop and a thumbnail.

    Keyword options (all read from ``kwargs``):
        url: page to capture (required).
        width, height: browser window size; default 1024 x 768.
        filename: name of the capture file; default 'screen.png'.
        path: directory for all generated files; default ROOT.
        crop: when true, also produce a cropped image.
        crop_width, crop_height: crop size; default to width / height.
        crop_replace: when true, the crop overwrites the capture instead of
            being written to 'crop_<filename>'.
        thumbnail: when true, also produce a thumbnail of the crop
            (requires crop).
        thumbnail_width, thumbnail_height: thumbnail size; default to
            width / height.
        thumbnail_replace: when true, the thumbnail overwrites the crop
            instead of being written to 'thumbnail_<filename>'.

    Returns:
        Tuple (screen_path, crop_path, thumbnail_path). A path whose image
        was not requested, or was written in "replace" mode, equals the
        path of the image it derives from.

    Raises:
        KeyError: 'url' was not supplied.
        Exception: a thumbnail was requested without crop.
    """
    url = kwargs['url']
    width = int(kwargs.get('width', 1024))
    height = int(kwargs.get('height', 768))
    filename = kwargs.get('filename', 'screen.png')
    path = kwargs.get('path', ROOT)

    crop = kwargs.get('crop', False)
    crop_width = int(kwargs.get('crop_width', width))
    crop_height = int(kwargs.get('crop_height', height))
    crop_replace = kwargs.get('crop_replace', False)

    thumbnail = kwargs.get('thumbnail', False)
    thumbnail_width = int(kwargs.get('thumbnail_width', width))
    thumbnail_height = int(kwargs.get('thumbnail_height', height))
    thumbnail_replace = kwargs.get('thumbnail_replace', False)

    screen_path = abspath(path, filename)
    # Until a separate file is chosen below, each derived image shares the
    # path of (and therefore overwrites) the image it is made from.
    crop_path = thumbnail_path = screen_path

    if thumbnail and not crop:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise Exception('Thumnail generation requires crop image, set crop=True')

    do_screen_capturing(url, screen_path, width, height)

    if crop:
        if not crop_replace:
            crop_path = abspath(path, 'crop_' + filename)
        do_crop({
            'width': crop_width, 'height': crop_height,
            'crop_path': crop_path, 'screen_path': screen_path,
        })

        if thumbnail:
            if not thumbnail_replace:
                thumbnail_path = abspath(path, 'thumbnail_' + filename)
            do_thumbnail({
                'width': thumbnail_width, 'height': thumbnail_height,
                'thumbnail_path': thumbnail_path, 'crop_path': crop_path,
            })
    return screen_path, crop_path, thumbnail_path


if __name__ == '__main__':
    # Requirements:
    #   - Install NodeJS
    #   - Using Node's package manager install phantomjs:
    #       npm -g install phantomjs
    #   - Install selenium (in your virtualenv, if you are using that)
    #   - Install imageMagick
    #   - Add phantomjs to the system path (on Windows)

    url = 'http://stackoverflow.com/questions/1197172/how-can-i-take-a-screenshot-image-of-a-website-using-python'
    # Capture the page, then write the crop and the thumbnail to their own
    # files instead of overwriting the previous image.
    options = dict(
        url=url,
        filename='sof.png',
        crop=True,
        crop_replace=False,
        thumbnail=True,
        thumbnail_replace=False,
        thumbnail_width=200,
        thumbnail_height=150,
    )
    screen_path, crop_path, thumbnail_path = get_screen_shot(**options)

These are the generated images:

I can't comment on ars's answer, but I actually got Roland Tapken's code running using QtWebkit and it works quite well.

Just wanted to confirm that what Roland posts on his blog works great on Ubuntu. Our production version ended up not using any of what he wrote but we are using the PyQt/QtWebKit bindings with much success.

You can do this using Selenium:

from selenium import webdriver

# Name (or full path) of the chromedriver executable.
DRIVER = 'chromedriver'
driver = webdriver.Chrome(DRIVER)
try:
    driver.get('https://www.spotify.com')
    # The result of save_screenshot is kept so callers can inspect it.
    screenshot = driver.save_screenshot('my_screenshot.png')
finally:
    # Always shut the browser down, even if loading or saving raised.
    driver.quit()

https://sites.google.com/a/chromium.org/chromedriver/getting-started

Using Rendertron is an option. Under the hood, this is a headless Chrome exposing the following endpoints:

/render/:url: Access this route e.g. with requests.get if you are interested in the DOM.
/screenshot/:url: Access this route if you are interested in a screenshot.

You would install rendertron with npm, run rendertron in one terminal, access http://localhost:3000/screenshot/:url and save the file, but a demo is available at render-tron.appspot.com making it possible to run this Python3 snippet locally without installing the npm package:

import requests

BASE = 'https://render-tron.appspot.com/screenshot/'
url = 'https://google.com'
path = 'target.jpg'
# stream=True defers downloading the body; the with-block guarantees the
# connection is released even if writing the file fails.
with requests.get(BASE + url, stream=True) as response:
    # save file, see https://stackoverflow.com/a/13137873/7665691
    if response.status_code == 200:
        with open(path, 'wb') as image_file:
            # Iterating the response directly yields tiny 128-byte chunks.
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
    else:
        # Report the failure instead of silently writing nothing.
        print('Screenshot request failed with HTTP status %s'
              % response.status_code)

You don't mention what environment you're running in, which makes a big difference because there isn't a pure Python web browser that's capable of rendering HTML.

But if you're using a Mac, I've used webkit2png with great success. If not, as others have pointed out there are plenty of options.

Try this..

#!/usr/bin/env python

import gtk.gdk

import time

import random

# Take a screenshot of the whole root window at random intervals, forever.
while True:
    # randrange excludes the upper bound: the delay is 120..299 seconds
    # (roughly between 2 and 5 minutes).
    random_time = random.randrange(120, 300)
    print("Next picture in: %.2f minutes" % (float(random_time) / 60))

    time.sleep(random_time)

    w = gtk.gdk.get_default_root_window()
    sz = w.get_size()

    print("The size of the window is %d x %d" % sz)

    # Allocate a pixbuf of the window's size, then copy the window into it.
    pb = gtk.gdk.Pixbuf(gtk.gdk.COLORSPACE_RGB, False, 8, sz[0], sz[1])
    pb = pb.get_from_drawable(w, w.get_colormap(), 0, 0, 0, 0, sz[0], sz[1])

    # The timestamp keeps the file name of every capture unique.
    ts = time.time()
    filename = "screenshot" + str(ts) + ".png"

    # NOTE(review): get_from_drawable presumably returns None on failure;
    # confirm against the PyGTK reference.
    if pb is not None:
        pb.save(filename, "png")
        print("Screenshot saved to " + filename)
    else:
        print("Unable to get the screenshot.")

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow