Question

I have the following kernel performing a simple assignment of a global memory matrix in to a global memory matrix out:

__global__ void simple_copy(float *outdata, const float *indata){

    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;

    int width = gridDim.x * TILE_DIM;

    outdata[y*width + x] = indata[y*width + x];

}

I'm inspecting the disassembled microcode dumped by cuobjdump:

Function : _Z11simple_copyPfPKf
/*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100]; 
/*0008*/     /*0x80001de218000000*/     MOV32I R0, 0x20;            R0 = TILE_DIM
/*0010*/     /*0x00001c8614000000*/     LDC R0, c [0x0] [R0];       R0 = c
/*0018*/     /*0x90009de218000000*/     MOV32I R2, 0x24;            R2 = 36
/*0020*/     /*0x00209c8614000000*/     LDC R2, c [0x0] [R2];       R2 = c

int x = blockIdx.x * TILE_DIM + threadIdx.x;
/*0028*/     /*0x9400dc042c000000*/     S2R R3, SR_CTAid_X;         R3 = BlockIdx.x
/*0030*/     /*0x0c00dde428000000*/     MOV R3, R3;                 R3 = R3 ???
/*0038*/     /*0x84011c042c000000*/     S2R R4, SR_Tid_X;           R3 = ThreadIdx.x
/*0040*/     /*0x10011de428000000*/     MOV R4, R4;                 R4 = R4 ???
/*0048*/     /*0x8030dca32008c000*/     IMAD R3, R3, 0x20, R4;      R3 = R3 * TILE_DIM + R4  (contains x)

int y = blockIdx.y * TILE_DIM + threadIdx.y;
/*0050*/     /*0x98011c042c000000*/     S2R R4, SR_CTAid_Y;
/*0058*/     /*0x10011de428000000*/     MOV R4, R4;
/*0060*/     /*0x88015c042c000000*/     S2R R5, SR_Tid_Y;
/*0068*/     /*0x14015de428000000*/     MOV R5, R5;
/*0070*/     /*0x80411ca3200ac000*/     IMAD R4, R4, 0x20, R5;      R4 ...                   (contains y)

int width = gridDim.x * TILE_DIM;
/*0078*/     /*0x50015de428004000*/     MOV R5, c [0x0] [0x14];     R5 = c
/*0080*/     /*0x80515ca35000c000*/     IMUL R5, R5, 0x20;          R5 = R5 * TILE_DIM       (contains width)   

y*width + x
/*0088*/     /*0x14419ca320060000*/     IMAD R6, R4, R5, R3;        R6 = R4 * R5 + R3        (contains y*width+x)

Loads indata[y*width + x]
/*0090*/     /*0x08619c036000c000*/     SHL R6, R6, 0x2;            
/*0098*/     /*0x18209c0348000000*/     IADD R2, R2, R6;            
/*00a0*/     /*0x08009de428000000*/     MOV R2, R2;                 R2 = R2 ???
/*00a8*/     /*0x00209c8580000000*/     LD R2, [R2];                Load from memory - R2 = 

Stores outdata[y*width + x]
/*00b0*/     /*0x1440dca320060000*/     IMAD R3, R4, R5, R3;        
/*00b8*/     /*0x0830dc036000c000*/     SHL R3, R3, 0x2;
/*00c0*/     /*0x0c001c0348000000*/     IADD R0, R0, R3;            R0 = R0 + R3
/*00c8*/     /*0x00001de428000000*/     MOV R0, R0;                 R0 = R0 ???
/*00d0*/     /*0x00009c8590000000*/     ST [R0], R2;                Store to memory

/*00d8*/     /*0x40001de740000000*/     BRA 0xf0;
/*00e0*/     /*0x00001de780000000*/     EXIT;
/*00e8*/     /*0x00001de780000000*/     EXIT;
/*00f0*/     /*0x00001de780000000*/     EXIT;
/*00f8*/     /*0x00001de780000000*/     EXIT;

The comments on top or aside of the disassembled code are my own.

As you can see, there are some apparently useless operations, marked by ??? in the comments. Essentially, they are moves of registers into themselves.

I have then the two following questions:

  1. If they are useless, I believe that they are uselessly consuming computation time. Can I optimize the disassembled microcode by removing them?
  2. PTX files can be inlined in CUDA codes. However, PTX is just an intermediate language needed for portability across GPUs. Can I somehow "inline" an optimized disassembled microcode?

Thank you very much in advance.

EDIT: THE SAME CODE COMPILED IN RELEASE MODE FOR SM = 2.0

Function : _Z11simple_copyPfPKf
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/        MOV R1, c[0x1][0x100];            /* 0x2800440400005de4 */
/*0008*/        S2R R0, SR_CTAID.Y;               /* 0x2c00000098001c04 */
/*0010*/        S2R R2, SR_TID.Y;                 /* 0x2c00000088009c04 */
/*0018*/        S2R R3, SR_CTAID.X;               /* 0x2c0000009400dc04 */
/*0020*/        S2R R4, SR_TID.X;                 /* 0x2c00000084011c04 */
/*0028*/        MOV R5, c[0x0][0x14];             /* 0x2800400050015de4 */
/*0030*/        ISCADD R2, R0, R2, 0x5;           /* 0x4000000008009ca3 */
/*0038*/        ISCADD R3, R3, R4, 0x5;           /* 0x400000001030dca3 */
/*0040*/        SHL R0, R5, 0x5;                  /* 0x6000c00014501c03 */
/*0048*/        IMAD R2, R0, R2, R3;              /* 0x2006000008009ca3 */
/*0050*/        ISCADD R0, R2, c[0x0][0x24], 0x2; /* 0x4000400090201c43 */
/*0058*/        ISCADD R2, R2, c[0x0][0x20], 0x2; /* 0x4000400080209c43 */
/*0060*/        LD R0, [R0];                      /* 0x8000000000001c85 */
/*0068*/        ST [R2], R0;                      /* 0x9000000000201c85 */
/*0070*/        EXIT ;                            /* 0x8000000000001de7 */

EDIT: THE SAME CODE COMPILED IN RELEASE MODE FOR SM = 2.1

Function : _Z11simple_copyPfPKf
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/        MOV R1, c[0x1][0x100];            /* 0x2800440400005de4 */
/*0008*/        NOP;                              /* 0x4000000000001de4 */
/*0010*/        MOV R0, c[0x0][0x14];             /* 0x2800400050001de4 */
/*0018*/        S2R R2, SR_CTAID.Y;               /* 0x2c00000098009c04 */
/*0020*/        SHL R0, R0, 0x5;                  /* 0x6000c00014001c03 */
/*0028*/        S2R R3, SR_TID.Y;                 /* 0x2c0000008800dc04 */
/*0030*/        ISCADD R3, R2, R3, 0x5;           /* 0x400000000c20dca3 */
/*0038*/        S2R R4, SR_CTAID.X;               /* 0x2c00000094011c04 */
/*0040*/        S2R R5, SR_TID.X;                 /* 0x2c00000084015c04 */
/*0048*/        ISCADD R2, R4, R5, 0x5;           /* 0x4000000014409ca3 */
/*0050*/        IMAD R2, R0, R3, R2;              /* 0x200400000c009ca3 */
/*0058*/        ISCADD R0, R2, c[0x0][0x24], 0x2; /* 0x4000400090201c43 */
/*0060*/        ISCADD R2, R2, c[0x0][0x20], 0x2; /* 0x4000400080209c43 */
/*0068*/        LD R0, [R0];                      /* 0x8000000000001c85 */
/*0070*/        ST [R2], R0;                      /* 0x9000000000201c85 */
/*0078*/        EXIT ;                            /* 0x8000000000001de7 */
Was it helpful?

Solution

The answer to both questions is no.

If you try to delete instructions from the final binary payload. you will change the length of code sections and break the ELF and fatbinary files. To fix that would require hand crafting headers whose formats are not readily documented, which sounds like a lot of work just to optimize out a couple of instructions.

And inline native assembler is not supported, but I am sure you knew that already.

And finally, I can't reproduce using CUDA 5.0:

Fatbin elf code:
================
arch = sm_20
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
identifier = pumpkinhead.cu

    code for sm_20
        Function : _Z11simple_copyPfPKf
    /*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];
    /*0008*/     /*0x98001c042c000000*/     S2R R0, SR_CTAid_Y;
    /*0010*/     /*0x88009c042c000000*/     S2R R2, SR_Tid_Y;
    /*0018*/     /*0x9400dc042c000000*/     S2R R3, SR_CTAid_X;
    /*0020*/     /*0x84011c042c000000*/     S2R R4, SR_Tid_X;
    /*0028*/     /*0x08001ca340000000*/     ISCADD R0, R0, R2, 0x5;
    /*0030*/     /*0x10309ca340000000*/     ISCADD R2, R3, R4, 0x5;
    /*0038*/     /*0x50001ca350004000*/     IMUL R0, R0, c [0x0] [0x14];
    /*0040*/     /*0x08009ca340000000*/     ISCADD R2, R0, R2, 0x5;
    /*0048*/     /*0x90201c4340004000*/     ISCADD R0, R2, c [0x0] [0x24], 0x2;
    /*0050*/     /*0x80209c4340004000*/     ISCADD R2, R2, c [0x0] [0x20], 0x2;
    /*0058*/     /*0x00001c8580000000*/     LD R0, [R0];
    /*0060*/     /*0x00201c8590000000*/     ST [R2], R0;
    /*0068*/     /*0x00001de780000000*/     EXIT;
        .....................................

Are you sure the code you have shown was compiled with release settings?

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top