Question

(Dependencies for this program: vector --any and JuicyPixels >= 2. Code is available as Gist.)

{-# LANGUAGE Haskell2010 #-}
{-# LANGUAGE BangPatterns #-}

import Control.Arrow
import Data.Bits
import Data.Vector.Unboxed ((!))
import Data.Word
import System.Environment (getArgs)

import qualified Codec.Picture as P
import qualified Data.ByteString as B
import qualified Data.Vector.Unboxed as V

I tried to port Ken Perlin's improved noise to Haskell, but I'm not entirely sure that my method is correct. The main part is something that should generalize nicely to higher and lower dimensions, but that is something for later:

perlin3 :: (Ord a, Num a, RealFrac a, V.Unbox a) => Permutation -> (a, a, a) -> a
perlin3 p (!x', !y', !z')
  = let (!xX, !x) = actuallyProperFraction x'
        (!yY, !y) = actuallyProperFraction y'
        (!zZ, !z) = actuallyProperFraction z'

        !u = fade x
        !v = fade y
        !w = fade z

        !h = xX
        !a = next p h + yY
        !b = next p (h+1) + yY
        !aa = next p a + zZ
        !ab = next p (a+1) + zZ
        !ba = next p b + zZ
        !bb = next p (b+1) + zZ
        !aaa = next p aa
        !aab = next p (aa+1)
        !aba = next p ab
        !abb = next p (ab+1)
        !baa = next p ba
        !bab = next p (ba+1)
        !bba = next p bb
        !bbb = next p (bb+1)

    in
        lerp w
            (lerp v
                (lerp u
                    (grad aaa (x, y, z))
                    (grad baa (x-1, y, z)))
                (lerp u
                    (grad aba (x, y-1, z))
                    (grad bba (x-1, y-1, z))))
            (lerp v
                (lerp u
                    (grad aab (x, y, z-1))
                    (grad bab (x-1, y, z-1)))
                (lerp u
                    (grad abb (x, y-1, z-1))
                    (grad bbb (x-1, y-1, z-1))))

This is of course accompanied by a few functions mentioned in the perlin3 function, of which I hope they are as efficient as possible:

fade :: (Ord a, Num a) => a -> a
fade !t | 0 <= t, t <= 1 = t * t * t * (t * (t * 6 - 15) + 10)

lerp :: (Ord a, Num a) => a -> a -> a -> a
lerp !t !a !b | 0 <= t, t <= 1 = a + t * (b - a)

grad :: (Bits hash, Integral hash, Num a, V.Unbox a) => hash -> (a, a, a) -> a
grad !hash (!x, !y, !z) = dot3 (vks `V.unsafeIndex` fromIntegral (hash .&. 15)) (x, y, z)
  where
    vks = V.fromList
        [ (1,1,0), (-1,1,0), (1,-1,0), (-1,-1,0)
        , (1,0,1), (-1,0,1), (1,0,-1), (-1,0,-1)
        , (0,1,1), (0,-1,1), (0,1,-1), (0,-1,-1)
        , (1,1,0), (-1,1,0), (0,-1,1), (0,-1,-1)
        ]

dot3 :: Num a => (a, a, a) -> (a, a, a) -> a
dot3 (!x0, !y0, !z0) (!x1, !y1, !z1) = x0 * x1 + y0 * y1 + z0 * z1

-- Unlike `properFraction`, `actuallyProperFraction` rounds as intended.
actuallyProperFraction :: (RealFrac a, Integral b) => a -> (b, a)
actuallyProperFraction x
  = let (ipart, fpart) = properFraction x
        r = if x >= 0 then (ipart, fpart)
                      else (ipart-1, 1+fpart)
    in r

For the permutation group, I just copied the one Perlin used on his website:

newtype Permutation = Permutation (V.Vector Word8)

mkPermutation :: [Word8] -> Permutation
mkPermutation xs
    | length xs >= 256
    = Permutation . V.fromList $ xs

permutation :: Permutation
permutation = mkPermutation
    [151,160,137,91,90,15,
   131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
   190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
   88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
   77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
   102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
   135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
   5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
   223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
   129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
   251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
   49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
   138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
   ]

next :: Permutation -> Word8 -> Word8
next (Permutation !v) !idx'
  = v `V.unsafeIndex` (fromIntegral $ idx' .&. 0xFF)

And all this is tied together with JuicyPixels:

main = do
    [target] <- getArgs
    let image = P.generateImage pixelRenderer 512 512
    P.writePng target image
  where
    pixelRenderer, pixelRenderer' :: Int -> Int -> Word8
    pixelRenderer !x !y
        = floor $ ((perlin3 permutation ((fromIntegral x - 256) / 32,
          (fromIntegral y - 256) / 32, 0 :: Double))+1)/2 * 128

    -- This code is much more readable, but also much slower.
    pixelRenderer' x y
        = (\w -> floor $ ((w+1)/2 * 128)) -- w should be in [-1,+1]
        . perlin3 permutation
        . (\(x,y,z) -> ((x-256)/32, (y-256)/32, (z-256)/32))
        $ (fromIntegral x, fromIntegral y, 0 :: Double)

My problem is that perlin3 seems very slow to me. If I profile it, pixelRenderer is getting a lot of time as well, but I'll ignore that for now. I don't know how to optimize perlin3. I tried to hint GHC with bang patterns, which cuts the execution time in half, so that's nice. Explicitly specializing and inlining barely helps with ghc -O. Is perlin3 supposed to be this slow?


UPDATE: an earlier version of this question mentioned a bug in my code. This problem has been resolved; it turns out my old version of actuallyProperFraction was buggy. It implicitly rounded the integral part of a floating point number to Word8, and then subtracted it from the floating point number to get the fractional part. Since Word8 can only take values between 0 and 255 inclusive, this won't work properly for numbers outside that range, including negative numbers.

Was it helpful?

Solution

This code appears to be mostly computation-bound. It can be improved a little bit, but not by much unless there's a way to use fewer array lookups and less arithmetic.

There are two useful tools for measuring performance: profiling and code dumps. I added an SCC annotation to perlin3 so that it would show up in the profile. Then I compiled with gcc -O2 -fforce-recomp -ddump-simpl -prof -auto. The -ddump-simpl flag prints the simplified code.

Profiling: On my computer, it takes 0.60 seconds to run the program, and about 20% of execution time (0.12 seconds) is spent in perlin3 according to the profile. Note that the precision of my profile info is about +/-3%.

Simplifier output: The simplifier produces fairly clean code. perlin3 gets inlined into pixelRenderer, so that's the part of the output you want to look at. Most of the code consists of unboxed array reads and unboxed arithmetic. To improve performance, we want to eliminate some of this arithmetic.

An easy change is to eliminate the run-time checks on SomeFraction (which doesn't appear in your question, but is part of the code that you uploaded). This reduces the program's execution time to 0.56 seconds.

-- someFraction t | 0 <= t, t < 1 = SomeFraction t
someFraction t = SomeFraction t

Next, there are several array lookups that show up in the simplifier like this:

                 case GHC.Prim.indexWord8Array#
                        ipv3_s23a
                        (GHC.Prim.+#
                           ipv1_s21N
                           (GHC.Prim.word2Int#
                              (GHC.Prim.and#
                                 (GHC.Prim.narrow8Word#
                                    (GHC.Prim.plusWord# ipv5_s256 (__word 1)))
                                 (__word 255))))

The primitive operation narrow8Word# is for coercing from an Int to a Word8. We can get rid of this coercion by using Int instead of Word8 in the definition of next.

next :: Permutation -> Int -> Int
next (Permutation !v) !idx'
  = fromIntegral $ v `V.unsafeIndex` (fromIntegral idx' .&. 0xFF)

This reduces the program's execution time to 0.54 seconds. Considering just the time spent in perlin3, the execution time has fallen (roughly) from 0.12 to 0.06 seconds. Although it's hard to measure where the rest of the time is going, it's most likely spread out among the remaining arithmetic and array accesses.

OTHER TIPS

On my machine reference code with Heatsink's optimisations takes 0.19 secs.

Firstly, I has moved from JuicyPixels to yarr and yarr-image-io with my favourite flags, -Odph -rtsopts -threaded -fno-liberate-case -funbox-strict-fields -fexpose-all-unfoldings -funfolding-keeness-factor1000 -fsimpl-tick-factor=500 -fllvm -optlo-O3 (they are given here):

import Data.Yarr as Y
import Data.Yarr.IO.Image as Y
...

main = do
    [target] <- getArgs
    image <- dComputeS $ fromFunction (512, 512) (return . pixelRenderer)
    Y.writeImage target (Grey image)
  where
    pixelRenderer, pixelRenderer' :: Dim2 -> Word8
    pixelRenderer (y, x)
        = floor $ ((perlin3 permutation ((fromIntegral x - 256) / 32,
          (fromIntegral y - 256) / 32, 0 :: Double))+1)/2 * 128

    -- This code is much more readable, but also much slower.
    pixelRenderer' (y, x)
        = (\w -> floor $ ((w+1)/2 * 128)) -- w should be in [-1,+1]
        . perlin3 permutation
        . (\(x,y,z) -> ((x-256)/32, (y-256)/32, (z-256)/32))
        $ (fromIntegral x, fromIntegral y, 0 :: Double)

This makes the program 30% faster, 0.13 seconds.

Secondly I has replaced uses of standard floor with

doubleToByte :: Double -> Word8
doubleToByte f = fromIntegral (truncate f :: Int)

It is known issue (google "haskell floor performance"). Execution time is reduced to 52 ms (0.052 secs), in almost 3 times.

Finally, just for fun I tried to compute noise in parallel (dComputeP instead of dComputeS and +RTS -N4 in command line run). Program took 36 ms, including I/O constant of about 10 ms.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top