Question

The following is the rootbeer example code for Nvidia CUDA, which I ran on a laptop with Ubuntu 12.04 (Precise) using Bumblebee and optirun. The laptop has Nvidia Optimus, hence optirun. The GPU is an Nvidia GeForce GT 540M, which the Nvidia website says has 96 CUDA cores. I get almost no throughput gain. What is the problem?

package com.random.test;

import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;

import edu.syr.pcpratts.rootbeer.runtime.Kernel;
import edu.syr.pcpratts.rootbeer.runtime.Rootbeer;

public class ArraySumApp {
    final static int numberOfJobs = 1024; // 1024 in the original example
    final static int sizeOfArray = 512; // 512 in the original example
    final static int theAnswer = 130816;

    public int[] sumArrays(List<int[]> arrays) {

        List<Kernel> jobs = new ArrayList<Kernel>();
        int[] ret = new int[arrays.size()];
        for (int i = 0; i < arrays.size(); ++i) {
            jobs.add(new ArraySum(arrays.get(i), ret, i));
        }

        Rootbeer rootbeer = new Rootbeer();
        rootbeer.runAll(jobs);
        return ret;
    }

    private static long measureOneJob() {

        int[] source = new int[ArraySumApp.sizeOfArray];
        int[] destination = new int[1];
        for (int i = 0; i < ArraySumApp.sizeOfArray; i++)
            source[i] = i;
        Kernel job = new ArraySum(source, destination, 0);

        ElapsedTimer et = new ElapsedTimer();
        job.gpuMethod();
        long timeInMs = et.stopInMilliseconds();
        System.out.println("measureOneJob " + et.stringInMilliseconds());

        assert destination[0] == ArraySumApp.theAnswer : "cosmic rays";
        return timeInMs;
    }

    public static void main(String[] args) {

        Helper.assertAssertionEnabled();

        // measure the time to do one job
        ArraySumApp.measureOneJob();
        long oneJob = ArraySumApp.measureOneJob();

        ArraySumApp app = new ArraySumApp();
        List<int[]> arrays = new ArrayList<int[]>();

        // you want 1000s of threads to run on the GPU all at once for speedups
        for (int i = 0; i < ArraySumApp.numberOfJobs; ++i) {
            int[] array = new int[ArraySumApp.sizeOfArray];
            for (int j = 0; j < array.length; ++j) {
                array[j] = j;
            }
            arrays.add(array);
        }

        ElapsedTimer et = new ElapsedTimer();
        int[] sums = app.sumArrays(arrays);
        long allJobs = et.stopInMilliseconds();
        System.out.println("measureAllJobs " + et.stringInMilliseconds());

        double gainFactor = ((double) ArraySumApp.numberOfJobs) * oneJob
                / allJobs;
        System.out.println(String.format(
                "throughput gain factor %.1f\nthroughput gain %.1f\n",
                gainFactor, gainFactor - 1.0d));

        // check the number of answers is correct
        assert sums.length == ArraySumApp.numberOfJobs : "cosmic rays";

        // check they all have the answer
        for (int i = 0; i < ArraySumApp.numberOfJobs; i++)
            assert sums[i] == ArraySumApp.theAnswer : "cosmic rays";
    }
}

class ArraySum implements Kernel {

    final static int repetitionFactor = 100000;

    private int[] source;
    private int[] ret;
    private int index;

    public ArraySum(int[] src, int[] dst, int i) {
        source = src;
        ret = dst;
        index = i;
    }

    public void gpuMethod() {
        for (int repetition = 0; repetition < ArraySum.repetitionFactor; repetition++) {
            int sum = 0;
            for (int i = 0; i < source.length; ++i) {
                sum += source[i];
            }
            ret[index] = sum;
        }
    }
}

class Helper {
    private Helper() {
    }

    static void assertAssertionEnabled() {
        try {
            assert false;
        } catch (AssertionError e) {
            return;
        }
        Helper.noteCosmicRays();
    }

    static void noteCosmicRays() // programmer design or logic error
    {
        throw new RuntimeException("cosmic rays");
    }
}

class ElapsedTimer {
    private org.joda.time.DateTime t0;
    private long savedStopInMilliseconds;

    public ElapsedTimer() {
        this.t0 = new org.joda.time.DateTime();
    }

    public long stopInMilliseconds() {
        return stop();
    }

    public String stringInMilliseconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        f.format("%d ms", this.savedStopInMilliseconds);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInMilliseconds() {
        stop();
        return stringInMilliseconds();
    }

    public String stringInSecondsAndMilliseconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        f.format("%5.3f s", this.savedStopInMilliseconds / 1000.0d);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInSecondsAndMilliseconds() {
        stop();
        return stringInSecondsAndMilliseconds();
    }

    public long stopInSeconds() {
        return (stop() + 500L) / 1000L; // rounding
    }

    public String stringInSeconds() // relies on a saved stop
    {
        Formatter f = new Formatter();
        long elapsed = (this.savedStopInMilliseconds + 500L) / 1000L; // rounding
        f.format("%d s", elapsed);
        String s = f.toString();
        f.close();
        return s;
    }

    public String stopStringInSeconds() {
        stop();
        return stringInSeconds();
    }

    /**
     * This is private. Use the stopInMilliseconds method if this is what you
     * need.
     */
    private long stop() {
        org.joda.time.DateTime t1 = new org.joda.time.DateTime();
        savedStopInMilliseconds = t1.getMillis() - this.t0.getMillis();
        return savedStopInMilliseconds;
    }
}

This is the output:

measureOneJob 110 ms
measureOneJob 26 ms
CudaRuntime2 ctor: elapsedTimeMillis: 609
measureAllJobs 24341 ms
throughput gain factor 1.1
throughput gain 0.1
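
(For reference, the printed figure follows directly from the code and the measurements above: gainFactor = numberOfJobs * oneJob / allJobs = 1024 * 26 ms / 24341 ms ≈ 1.1, so the batched GPU run is barely faster than the extrapolated time for running the 1024 jobs one after another.)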

Solution

The rootbeer developer said that the example code that sums array elements is not the best example, and that an alternative example would show throughput gains.

OTHER TIPS

You can see: https://github.com/pcpratts/rootbeer1/tree/develop/gtc2013/Matrix

This is an example for the 2013 NVIDIA GTC conference. I obtained a 20x speedup over a 4-core Java Matrix Multiply that uses transpose.

The example is a tiled Matrix Multiply using shared memory on the GPU. According to the NVIDIA literature, using shared memory is one of the most important aspects of getting good speedups. To use shared memory, you have each thread in a block load values into a shared array, and then you reuse those shared values several times. This saves the time spent fetching from global memory.

A fetch from global memory takes about 200-300 clock cycles, while a fetch from shared memory takes about 2-3 clock cycles on the Tesla 2.0 architecture.
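
For illustration, here is a minimal plain-Java sketch of that data-reuse pattern (a blocked/tiled matrix multiply). It runs on the CPU, so the small tile buffers only improve cache locality, but they play the same role as the shared arrays that each thread block fills cooperatively in the GPU kernel. The class name, method, and tile size below are made up for this sketch; the actual rootbeer GPU kernel is in the GitHub repository linked above.

class TiledMatrixMultiply {

    static final int TILE = 16; // tile edge; 16 x 16 matches a common GPU thread-block size

    // C = A * B for square n x n matrices stored row-major.
    // Assumes n is a multiple of TILE and that c starts zeroed
    // (as a freshly allocated Java array is).
    static void multiply(float[] a, float[] b, float[] c, int n) {
        float[] aTile = new float[TILE * TILE];
        float[] bTile = new float[TILE * TILE];
        for (int ti = 0; ti < n; ti += TILE) {
            for (int tj = 0; tj < n; tj += TILE) {
                for (int tk = 0; tk < n; tk += TILE) {
                    // load one tile of A and one tile of B into the small local
                    // buffers (on the GPU this is the cooperative shared-memory load)
                    for (int i = 0; i < TILE; i++) {
                        for (int j = 0; j < TILE; j++) {
                            aTile[i * TILE + j] = a[(ti + i) * n + (tk + j)];
                            bTile[i * TILE + j] = b[(tk + i) * n + (tj + j)];
                        }
                    }
                    // every loaded element is reused TILE times before the next load
                    for (int i = 0; i < TILE; i++) {
                        for (int j = 0; j < TILE; j++) {
                            float sum = 0f;
                            for (int k = 0; k < TILE; k++) {
                                sum += aTile[i * TILE + k] * bTile[k * TILE + j];
                            }
                            c[(ti + i) * n + (tj + j)] += sum;
                        }
                    }
                }
            }
        }
    }
}

The point is the reuse: each element copied into aTile and bTile is read TILE times, so one fetch from the large arrays (global memory on the GPU) is amortized over many multiply-adds.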
