Domanda

I have a set of images, and I would like to hash their data into an ID.

Currently I am doing this:

import hashlib
import uuid

def get_image_uuid(pil_img):
    # Read PIL image data
    img_bytes_ = pil_img.tobytes()
    # hash the bytes using sha1
    bytes_sha1 = hashlib.sha1(img_bytes_)
    hashbytes_20 = bytes_sha1.digest()
    # sha1 produces 20 bytes, but UUID requires 16 bytes
    hashbytes_16 = hashbytes_20[0:16]
    uuid_ = uuid.UUID(bytes=hashbytes_16)
    return uuid_

This reads all the pixel data in the image, which is overkill for a deterministic 16 byte UUID hash.

Is there a way to do something like this?

img_bytes = pil_img.tobytes(stride=16)

EDIT: I produced some detailed timing results using this script. I should mention that the images that I'm using are large (about 6MB). I tested on windows and linux:

from __future__ import absolute_import, division, print_function
import __builtin__
import time
import timeit
from PIL import Image
import hashlib
import numpy as np
import uuid

# My data getters
from vtool.tests import grabdata
elephant  = grabdata.get_testimg_path('elephant.jpg')
lena  = grabdata.get_testimg_path('lena.jpg')
zebra = grabdata.get_testimg_path('zebra.jpg')
jeff  = grabdata.get_testimg_path('jeff.png')
gpath = elephant


try:
    getattr(__builtin__, 'profile')
    __LINE_PROFILE__ = True
except AttributeError:
    __LINE_PROFILE__ = False
    def profile(func):
        return func


@profile
def get_image_uuid(img_bytes_):
    # hash the bytes using sha1
    bytes_sha1 = hashlib.sha1(img_bytes_)
    hashbytes_20 = bytes_sha1.digest()
    # sha1 produces 20 bytes, but UUID requires 16 bytes
    hashbytes_16 = hashbytes_20[0:16]
    uuid_ = uuid.UUID(bytes=hashbytes_16)
    return uuid_


@profile
def make_uuid_PIL_bytes(gpath):
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    img_bytes_ = pil_img.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_


@profile
def make_uuid_NUMPY_bytes(gpath):
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_flat = np_img.ravel()
    img_bytes_ = np_flat.tostring()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_


@profile
def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_flat = np_img.ravel()[::16]
    img_bytes_ = np_flat.tostring()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_


@profile
def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    img_bytes_ = np.asarray(pil_img).ravel()[::64].tostring()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_


@profile
def make_uuid_CONTIG_NUMPY_bytes(gpath):
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_flat = np_img.ravel().tostring()
    np_contig = np.ascontiguousarray(np_flat)
    img_bytes_ = np_contig.tostring()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_


@profile
def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_contig = np.ascontiguousarray(np_img.ravel()[::16])
    img_bytes_ = np_contig.tostring()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_


@profile
def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    img_bytes_ = np.ascontiguousarray(np.asarray(pil_img).ravel()[::64]).tostring()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_


if __name__ == '__main__':
    # cool trick
    test_funcs = [
        make_uuid_PIL_bytes,
        make_uuid_NUMPY_bytes,
        make_uuid_NUMPY_STRIDE_16_bytes,
        make_uuid_NUMPY_STRIDE_64_bytes,
        make_uuid_CONTIG_NUMPY_bytes,
        make_uuid_CONTIG_NUMPY_STRIDE_16_bytes,
        make_uuid_CONTIG_NUMPY_STRIDE_64_bytes,
    ]
    func_strs = ', '.join([func.func_name for func in test_funcs])
    setup = 'from __main__ import (gpath, %s) ' % (func_strs,)

    number = 2

    for func in test_funcs:
        func_name = func.func_name
        print('Running: %s' % func_name)
        if __LINE_PROFILE__:
            start = time.time()
            for _ in xrange(number):
                func(gpath)
            total_time = time.time() - start
        else:
            stmt = '%s(gpath)' % func_name
            total_time = timeit.timeit(stmt=stmt, setup=setup, number=number)
        print('timed: %r seconds in %s' % (total_time, func_name))

Here are the windows line profile results:

File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes at line 91
Total time: 1.03287 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    91                                           @profile
    92                                           def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
    93         2         3571   1785.5      0.1      pil_img = Image.open(gpath, 'r')
    94                                               # Read PIL image data
    95         2      3310103 1655051.5     96.2      np_img = np.asarray(pil_img)
    96         2        44833  22416.5      1.3      np_contig = np.ascontiguousarray(np_img.ravel()
[::16])
    97         2         9657   4828.5      0.3      img_bytes_ = np_contig.tostring()
    98         2        72560  36280.0      2.1      uuid_ = get_image_uuid(img_bytes_)
    99         2            4      2.0      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes at line 102
Total time: 1.0385 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   102                                           @profile
   103                                           def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
   104         2         3285   1642.5      0.1      pil_img = Image.open(gpath, 'r')
   105                                               # Read PIL image data
   106         2      3436641 1718320.5     99.3      img_bytes_ = np.ascontiguousarray(np.asarray(p
il_img).ravel()[::64]).tostring()
   107         2        19570   9785.0      0.6      uuid_ = get_image_uuid(img_bytes_)
   108         2            4      2.0      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_64_bytes at line 70
Total time: 1.04175 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    70                                           @profile
    71                                           def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
    72         2         3356   1678.0      0.1      pil_img = Image.open(gpath, 'r')
    73                                               # Read PIL image data
    74         2      3447197 1723598.5     99.3      img_bytes_ = np.asarray(pil_img).ravel()[::64]
.tostring()
    75         2        19774   9887.0      0.6      uuid_ = get_image_uuid(img_bytes_)
    76         2            4      2.0      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_16_bytes at line 59
Total time: 1.0913 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    59                                           @profile
    60                                           def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
    61         2         3706   1853.0      0.1      pil_img = Image.open(gpath, 'r')
    62                                               # Read PIL image data
    63         2      3339663 1669831.5     91.9      np_img = np.asarray(pil_img)
    64         2          112     56.0      0.0      np_flat = np_img.ravel()[::16]
    65         2       217844 108922.0      6.0      img_bytes_ = np_flat.tostring()
    66         2        74044  37022.0      2.0      uuid_ = get_image_uuid(img_bytes_)
    67         2            4      2.0      0.0      return uuid_


File: _timeits/time_uuids.py
Function: get_image_uuid at line 28
Total time: 1.10141 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    28                                           @profile
    29                                           def get_image_uuid(img_bytes_):
    30                                               # hash the bytes using sha1
    31        14      3665965 261854.6     99.9      bytes_sha1 = hashlib.sha1(img_bytes_)
    32        14          326     23.3      0.0      hashbytes_20 = bytes_sha1.digest()
    33                                               # sha1 produces 20 bytes, but UUID requires 16
bytes
    34        14           75      5.4      0.0      hashbytes_16 = hashbytes_20[0:16]
    35        14         2661    190.1      0.1      uuid_ = uuid.UUID(bytes=hashbytes_16)
    36        14           40      2.9      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_PIL_bytes at line 39
Total time: 1.33926 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    39                                           @profile
    40                                           def make_uuid_PIL_bytes(gpath):
    41         2        25940  12970.0      0.6      pil_img = Image.open(gpath, 'r')
    42                                               # Read PIL image data
    43         2      3277455 1638727.5     73.5      img_bytes_ = pil_img.tobytes()
    44         2      1158009 579004.5     26.0      uuid_ = get_image_uuid(img_bytes_)
    45         2            4      2.0      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_bytes at line 48
Total time: 1.39694 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    48                                           @profile
    49                                           def make_uuid_NUMPY_bytes(gpath):
    50         2         3406   1703.0      0.1      pil_img = Image.open(gpath, 'r')
    51                                               # Read PIL image data
    52         2      3344608 1672304.0     71.9      np_img = np.asarray(pil_img)
    53         2           46     23.0      0.0      np_flat = np_img.ravel()
    54         2       133593  66796.5      2.9      img_bytes_ = np_flat.tostring()
    55         2      1171888 585944.0     25.2      uuid_ = get_image_uuid(img_bytes_)
    56         2            5      2.5      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_bytes at line 79
Total time: 1.4899 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    79                                           @profile
    80                                           def make_uuid_CONTIG_NUMPY_bytes(gpath):
    81         2         3384   1692.0      0.1      pil_img = Image.open(gpath, 'r')
    82                                               # Read PIL image data
    83         2      3376051 1688025.5     68.0      np_img = np.asarray(pil_img)
    84         2       133156  66578.0      2.7      np_flat = np_img.ravel().tostring()
    85         2       146959  73479.5      3.0      np_contig = np.ascontiguousarray(np_flat)
    86         2       149330  74665.0      3.0      img_bytes_ = np_contig.tostring()
    87         2      1154328 577164.0     23.3      uuid_ = get_image_uuid(img_bytes_)
    88         2            4      2.0      0.0      return uuid_

Here are the Linux line profile results:

File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_64_bytes at line 70
Total time: 0.456272 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    70                                           @profile
    71                                           def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
    72         2          449    224.5      0.1      pil_img = Image.open(gpath, 'r')
    73                                               # Read PIL image data
    74         2       452880 226440.0     99.3      img_bytes_ = np.asarray(pil_img).ravel()[::64].
tostring()
    75         2         2942   1471.0      0.6      uuid_ = get_image_uuid(img_bytes_)
    76         2            1      0.5      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes at line 102
Total time: 0.457588 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   102                                           @profile
   103                                           def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
   104         2          445    222.5      0.1      pil_img = Image.open(gpath, 'r')
   105                                               # Read PIL image data
   106         2       454269 227134.5     99.3      img_bytes_ = np.ascontiguousarray(np.asarray(pi
l_img).ravel()[::64]).tostring()
   107         2         2872   1436.0      0.6      uuid_ = get_image_uuid(img_bytes_)
   108         2            2      1.0      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes at line 91
Total time: 0.461928 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    91                                           @profile
    92                                           def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
    93         2          482    241.0      0.1      pil_img = Image.open(gpath, 'r')
    94                                               # Read PIL image data
    95         2       436622 218311.0     94.5      np_img = np.asarray(pil_img)
    96         2        10990   5495.0      2.4      np_contig = np.ascontiguousarray(np_img.ravel()
[::16])
    97         2         2931   1465.5      0.6      img_bytes_ = np_contig.tostring()
    98         2        10902   5451.0      2.4      uuid_ = get_image_uuid(img_bytes_)
    99         2            1      0.5      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_16_bytes at line 59
Total time: 0.492819 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    59                                           @profile
    60                                           def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
    61         2          481    240.5      0.1      pil_img = Image.open(gpath, 'r')
    62                                               # Read PIL image data
    63         2       441343 220671.5     89.6      np_img = np.asarray(pil_img)
    64         2           34     17.0      0.0      np_flat = np_img.ravel()[::16]
    65         2        39996  19998.0      8.1      img_bytes_ = np_flat.tostring()
    66         2        10964   5482.0      2.2      uuid_ = get_image_uuid(img_bytes_)
    67         2            1      0.5      0.0      return uuid_


File: _timeits/time_uuids.py
Function: get_image_uuid at line 28
Total time: 0.545926 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    28                                           @profile
    29                                           def get_image_uuid(img_bytes_):
    30                                               # hash the bytes using sha1
    31        14       545037  38931.2     99.8      bytes_sha1 = hashlib.sha1(img_bytes_)
    32        14          115      8.2      0.0      hashbytes_20 = bytes_sha1.digest()
    33                                               # sha1 produces 20 bytes, but UUID requires 16
bytes
    34        14           24      1.7      0.0      hashbytes_16 = hashbytes_20[0:16]
    35        14          742     53.0      0.1      uuid_ = uuid.UUID(bytes=hashbytes_16)
    36        14            8      0.6      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_PIL_bytes at line 39
Total time: 0.625736 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    39                                           @profile
    40                                           def make_uuid_PIL_bytes(gpath):
    41         2         3915   1957.5      0.6      pil_img = Image.open(gpath, 'r')
    42                                               # Read PIL image data
    43         2       449092 224546.0     71.8      img_bytes_ = pil_img.tobytes()
    44         2       172728  86364.0     27.6      uuid_ = get_image_uuid(img_bytes_)
    45         2            1      0.5      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_bytes at line 48
Total time: 0.663057 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    48                                           @profile
    49                                           def make_uuid_NUMPY_bytes(gpath):
    50         2          468    234.0      0.1      pil_img = Image.open(gpath, 'r')
    51                                               # Read PIL image data
    52         2       437346 218673.0     66.0      np_img = np.asarray(pil_img)
    53         2           18      9.0      0.0      np_flat = np_img.ravel()
    54         2        51512  25756.0      7.8      img_bytes_ = np_flat.tostring()
    55         2       173712  86856.0     26.2      uuid_ = get_image_uuid(img_bytes_)
    56         2            1      0.5      0.0      return uuid_


File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_bytes at line 79
Total time: 0.756671 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    79                                           @profile
    80                                           def make_uuid_CONTIG_NUMPY_bytes(gpath):
    81         2          483    241.5      0.1      pil_img = Image.open(gpath, 'r')
    82                                               # Read PIL image data
    83         2       437192 218596.0     57.8      np_img = np.asarray(pil_img)
    84         2        48152  24076.0      6.4      np_flat = np_img.ravel().tostring()
    85         2        49502  24751.0      6.5      np_contig = np.ascontiguousarray(np_flat)
    86         2        49269  24634.5      6.5      img_bytes_ = np_contig.tostring()
    87         2       172072  86036.0     22.7      uuid_ = get_image_uuid(img_bytes_)
    88         2            1      0.5      0.0      return uuid_

Here are the Windows timeit results:

Running: make_uuid_PIL_bytes
timed: 1.4041314945785952 seconds in make_uuid_PIL_bytes
Running: make_uuid_NUMPY_bytes
timed: 1.4475939890251077 seconds in make_uuid_NUMPY_bytes
Running: make_uuid_NUMPY_STRIDE_16_bytes
timed: 1.136886564762671 seconds in make_uuid_NUMPY_STRIDE_16_bytes
Running: make_uuid_NUMPY_STRIDE_64_bytes
timed: 1.0767879228155284 seconds in make_uuid_NUMPY_STRIDE_64_bytes
Running: make_uuid_CONTIG_NUMPY_bytes
timed: 1.5433727380795146 seconds in make_uuid_CONTIG_NUMPY_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
timed: 1.0804961515831941 seconds in make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
timed: 1.0577325560451953 seconds in make_uuid_CONTIG_NUMPY_STRIDE_64_bytes

And the linux timeit results:

Running: make_uuid_PIL_bytes
timed: 0.6316661834716797 seconds in make_uuid_PIL_bytes
Running: make_uuid_NUMPY_bytes
timed: 0.666496992111206 seconds in make_uuid_NUMPY_bytes
Running: make_uuid_NUMPY_STRIDE_16_bytes
timed: 0.4908161163330078 seconds in make_uuid_NUMPY_STRIDE_16_bytes
Running: make_uuid_NUMPY_STRIDE_64_bytes
timed: 0.4494049549102783 seconds in make_uuid_NUMPY_STRIDE_64_bytes
Running: make_uuid_CONTIG_NUMPY_bytes
timed: 0.7838680744171143 seconds in make_uuid_CONTIG_NUMPY_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
timed: 0.462860107421875 seconds in make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
timed: 0.45322108268737793 seconds in make_uuid_CONTIG_NUMPY_STRIDE_64_bytes

So it does look like the loading of the image is the main culprit (because these images are so big), but the strides to help the hashing a small (but significant) amount.

Still it would be very nice to be able to load only a subset of that data. Does anyone know any way to do this?

È stato utile?

Soluzione

(I'm using Pillow 5.1.0 on Python 3.6.4, on macOS 10.13.3)

I recently had a similar issue while working with images larger than 250MB(!). My use case was slightly different, as I needed actual RGB values, and not bytes, but I found that cropping the image first, and then running getdata() on the cropped area, was much faster for "random access" to a slice of the image. Specifically, on a 30MB image, it's about 28,000 times faster to do img.crop(<x,y,w,h>).getdata() than img.getdata()[<slice>].

>>> t0 = time.time(); x = list(img.getdata())[3336*500:3336*500+3]; t1 = time.time(); print(x, t1-t0)
[(92, 102, 136), (110, 153, 220), (114, 184, 232)] 1.6889581680297852
>>> t0 = time.time(); y = list(img.crop((0, 500, 3, 501)).getdata()); t1 = time.time(); print(y, t1-t0)
[(92, 102, 136), (110, 153, 220), (114, 184, 232)] 5.91278076171875e-05

(1.6 seconds vs. 0.000059 seconds)

Again, this gets you RGB values, not the image byte data, but depending on your needs, this might be acceptable. This also has the side benefit of not requiring numpy, which for me is a plus.

And of course, the logic then depends on how much data you need, and from where, as that might require wrapping around to the next row. That would be ugly, and may not be worth the maintenance/readability cost.

Altri suggerimenti

You can convert the image to a numpy.array, and then use slice notation. You probably will want to first flatten the picture into a single-dimension array, which you can do with array.ravel.

>>> import numpy as np
>>> pixels = np.asarray(pil_img)
>>> pixels.shape
(2592, 1936, 3)
>>> subset = pixels.ravel()[::16] #every 16th byte of pixels.
>>> subset.shape
(940896,)

Notice that the resulting size of the array is equal to (2592 * 1936 * 3) / 16.

Edit

Your comment made me curious, so I went ahead and timed it myself. It turns out that hashlib.sha1 has some additional requirements of the arrays it processes — namely that they be contiguous and in 'C-order' (don't worry about that if it doesn't make sense).

So I ended up having to do the following:

pixels =np.ascontiguousarray(np.asarray(img).ravel()[::16])
hashlib.sha1(pixels)

Anyway, here are the timing results:

In [27]: %timeit hashlib.sha1(img.tobytes())
10 loops, best of 3: 36.3 ms per loop

In [28]: %timeit px =np.ascontiguousarray(np.asarray(img).ravel()[::16]); hashlib.sha1(px)
100 loops, best of 3: 16.9 ms per loop

So it turns out that the numpy array is about twice as fast. But-- it's only using 1/16th of the data. I'm not sure what you're using the hash for, but I might recommend just using the whole image for an extra 20ms.

Autorizzato sotto: CC-BY-SA insieme a attribuzione
Non affiliato a StackOverflow
scroll top