代码之家  ›  专栏  ›  技术社区  ›  glc78

装配中的矩阵乘法

  •  1
  • glc78  · 技术社区  · 8 年前

            unsigned int m = 3; // raws of mat1
            unsigned int n = 2; // columns of mat1
            unsigned int k = 4; // columns of mat2
            short int mat1[] = { -1,-2, 4,5, 4,-2 }; // first matrix
            short int mat2[] = { 2,0,4,6, 0,2,-1,3 }; // second matrix
            int mat3[1024]; // output matrix
    
            __asm {
    
                XOR EAX, EAX    //mat1 raws counter
                XOR EBX, EBX    //mat2 columns counter
                XOR EDX, EDX    //mat1 columns(equal to mat2 raws) counter
                XOR EDI, EDI    //will contain sum of multiplications to be copied into output matrix
    
                Loop1 :         //determinates the raws of output matrix: mat3
                XOR EBX, EBX    //at the end of first raw, column counter is resetted
                    CMP m, EAX  //if loopped mat1 m-raws times...       
                    JZ Finale   //...algortihm is over
                    INC EAX     //increase mat1 raws counter 
                    JMP Loop2
    
                Loop2 :             //determinates the columns of mat3
                XOR EDX, EDX        //at the end of the n-sums, mat1 column counter is resetted
                    XOR EDI, EDI    //after sum of n-multiplications edi is resetted
                    CMP k, EBX      //if multiplications/sums on this raw have been done...
                    JZ Loop1        //...go to next output matrix raw
                    INC EBX         //increase mat2 columns counter
                    JMP Loop3
    
                Loop3 :         //determinates elements of mat3
                CMP n, EDX      //if the n-multiplacations/sums on first n-elements have been done...
                    JZ Loop2    //...skip to next n-elements
                    INC EDX     //increase counter of the elements that will be multiplicate
                    JMP Stuffs  //go to operations code block
    
                Stuffs :                                        //here code generates mat3 elements
        #58     MOV SI, mat1[2 * ((EAX - 1) * 2 + (EDX - 1)]    //moves to SI the [m-raws/n-clomumn] element of mat1
        #59         IMUL SI, mat2[2 * ((EBX - 1) * 2 + (EDX - 1)]   //multiplicates(with sign) SI and [n-raws/k-column] element of mat2
                    ADD DI, SI                                  //sum the result in edi
                    CMP n, EDX                                  //check the sums
                    JZ CopyResult                               //if n-sums have been done...
                    JMP Loop3                                   //...go to copy result into mat3
    
                CopyResult :
        #66     MOV mat3[4 * ((EAX - 1) * 4 + (EBX - 1))], EDI  //copy value to output matrix mat3
                    JMP Loop3                                   //go to next n-elements
    
                Finale :
        }
    
        {
            unsigned int i, j, h;
    
            printf("Output matrix:\n");
            for (i = h = 0; i < m; i++) {
                for (j = 0; j < k; j++, h++)
                    printf("%6d ", mat3[h]);
                printf("\n");
            }
    
        }
    

    在这段代码中,编译器针对mat1、mat2和mat3报告两种类型的引用IMUL和MOV的错误。如下所示:

    • 第58行-错误C2404“EDX”:“第二个操作数”中的寄存器非法
    • 第58行-错误C2430“第二个操作数”中有多个索引寄存器

    第59行和第66行的错误与EDX和EBX寄存器相同。

    这个算法基本上好吗?(我测试了一些手动设置 索引,然后是最后一个,在调试期间,它很好,但我无法完全测试它)。

    我认为第一个错误取决于第二个错误,但如果我 不能这样使用寄存器,我可以做什么来计算输出?

    2 回复  |  直到 8 年前
        1
  •  4
  •   Community CDub    8 年前

    而不是在寻址模式下尝试将多个寄存器扩展为两个( which is impossible ),只需使用 add eax, 2 而不是 inc eax .

    此外,由于输出矩阵使用32位 int ,你应该做32位数学。您将在DI中生成一个值,然后用第66行存储该值加上EDI高半部分中的任何垃圾。

    有点像 movsx esi, word ptr [rowstart + column]
    movsx eax, word ptr [offset_in_column + row] / imul eax, esi


    基于我认为你正在尝试做的,我认为你的算法可能是健全的。对于输出矩阵的每个元素,循环遍历一个矩阵中的列和另一个矩阵的行。因此,您只对输出矩阵的每个元素存储一次。不管你的循环是否真的这样做,IDK:分支有多么难看,这会伤到我的大脑 http://gcc.godbolt.org/ ).


    嵌套循环的其他方法对于大型矩阵的性能可能更好或更差,但唯一真正好的方法(对于大型矩阵)是转置其中一个输入矩阵,以便可以一次循环两个矩阵中的连续内存元素(因为转置花费O(n^2)时间,但会加快重复遍历转置数组的O(n*3)步骤,因为它提供了更多缓存命中)。

        2
  •  2
  •   glc78    8 年前

    谢谢大家。这是我最后的三嵌套循环和矩阵乘积代码。对于我测试的内容,它适用于任何矩阵和正负值:

    void main()
    {
        unsigned int m = 3; // numero di righe della prima matrice
        unsigned int n = 2; // numero di colonne della prima matrice
        unsigned int k = 4; // numero di colonne della seconda matrice
        short int mat1[] = { -1,-2, 4,5, 4,-2 }; // prima matrice
        short int mat2[] = { 2,0,0,0, 0,2,0,0 }; // seconda matrice
        int mat3[1024]; // matrice risultato
    
        __asm {
    
            lea eax, mat1       //load mat1
            lea edi, mat3       //load mat result
            push m
    
            Loop3 :             //extern loop
            lea ebx, mat2       //load here mat2 to start from the beginning when new result raw starts
                xor edx, edx    //sets to zero column counter set to zero when new result row starts
    
            Loop2 :             //middle loop, as long as k, mat2 columns 
            xor ecx, ecx        //sets to zero mat1 column counter every n multiplications
    
            Loop1 :             //inner loop
            call Compute        //calls sub program that calulates raw/column products
                inc ecx         //increase column counter
                cmp ecx, n      //check column counter
                jb Loop1        //if below loop1 again
                add ebx, 2      //if equal to n, inner loop is over, move mat2 position of one position
                inc edx         //increase mat2 column counter
                cmp edx, k      //chek mat2 column counter
                jb Loop2        //if below loop2 again
                imul esi, n, 2      //else calculate offset to skip to new raw in mat1
                add eax, esi        //...skip to new mat1 raw
                imul esi, k, 4      //calculate offset to skip to new raw in result matrix(mat3)
                add edi, esi        //...skip to new raw in mat3
                dec m               //a raw in mat1 has been done, decrease its counter
                cmp m, 0            //check how many raws has been done
                ja Loop3            //if more than zero, do extern loop again
                jmp Finale          //else algorithm is over
    
            Compute :               //calulates raw/column products
            movsx esi, WORD PTR[eax][ecx * 2]       
                push edi            //stores mat3 address to free edi counter
                push ecx            //stores the actual value of column counter to free ecx register
                mov edi, k          //calculates the offset in mat2...
                imul edi, ecx       //...
                movsx ecx, WORD PTR[ebx][edi * 2]   //mov the value of mat2 to ecx
                imul esi, ecx       //multiplicates values of mat1 ad mat2
                pop ecx             //set back column counter
                pop edi             //set back mat3 address
                cmp ecx, 0          //if ecx is zero...
                je First            //...is the first multiplication for this result value...
                add[edi][edx * 4], esi  //if not the first, add the value to current position
    
            Back :
            ret                     //in any case, comes back to loops...
    
            First :                 //...so move here the first value to which add the others
            mov[edi][edx * 4], esi  //moving value
                jmp Back
    
            Finale :        //the end
            pop m           //restore the original mat1 raw value to print the result matrix below
    
        }
    
        //Output on video
    
        {
            unsigned int i, j, h;
    
            printf("Product Matrix:\n");
            for (i = h = 0; i < m; i++) {
                for (j = 0; j < k; j++, h++)
                    printf("%6d ", mat3[h]);
                printf("\n");
            }
    
        }
    }
    

    这只是我从stackoverflaw中的双嵌套循环开始编写的三嵌套循环的算法:

    m = raws of first matrix
    k = columns of second matrix
    n = column of first matrix and raws of second matrix
    
    x=0
    Loop3:
    y=0
    Loop2:
    z=0
    Loop1:
    compute...
    z++
    if(z<n)
       go to Loop1
    y++
    if(y < k)
       go to Loop2
    x++
    if(x < m)
       go to Loop3
    Else go to the End