// DOTCLOOP           Scalar Product of N-vectors

// This program will compute the scalar product
// of two multielement vectors V and W.
        N       = 3               // N = dimensionality (elements per vector)
        // main reads N 2-byte elements from each of V and W, so N must
        // match the number of initializers on the two data2 lines below.
        // With the values given, the product stored in P is
        // (-1)(-2) + (3)(-4) + (5)(6) = 20.
        .data                     // Declare storage
        .align  8                 // Align P, which main writes with st8
P:      .skip   8                 // 8 bytes of space for the product
V:      data2   -1,+3,+5          // V1, V2, V3 as 2-byte values
W:      data2   -2,-4,+6          // W1, W2, W3 as 2-byte values
        .text                     // Code section
        .align  32                // Align the code that follows

// main -- compute the scalar product of V and W and store it in P.
//   Reads:     N 2-byte elements from each of V and W
//   Writes:    the 8-byte sum of products to P
//   Returns:   r8 = 0
//   Registers: r9 (holds caller's ar.lc), r14-r17, r20-r22;
//              ar.lc is used as the loop counter and restored on exit
        .global main              // Export the entry point
        .proc   main
main:
        .prologue                 // Unwind info: ar.lc is kept in r9
        .save   ar.lc, r9
        mov     r9 = ar.lc;;      // Preserve the caller's loop count
        .body
        // Setup: these five instructions write distinct registers, so
        // they can share one instruction group.
first:  movl    r14 = V           // r14 -> current element of V
        movl    r15 = W           // r15 -> current element of W
        movl    r16 = P           // r16 -> result slot
        mov     r20 = 0           // r20 = accumulated sum
        mov     r17 = N-1;;       // br.cloop iterates ar.lc + 1 times
        mov     ar.lc = r17       // Loop count = N - 1
        // Loop body: one element pair per pass.
top:    ld2     r22 = [r15],2     // r22 = Wi; advance W pointer by 2
        ld2     r21 = [r14],2;;   // r21 = Vi; advance V pointer by 2
        pmpy2.r r21 = r22,r21;;   // Low 32 bits = signed Wi * Vi
        sxt4    r21 = r21;;       // Widen the 32-bit product to 64 bits
        add     r20 = r20,r21     // sum += Vi * Wi
        br.cloop.sptk.few top;;   // Repeat while ar.lc is nonzero
        st8     [r16] = r20;;     // P = sum
done:   mov     r8 = 0            // Return value 0: all is normal
        mov     ar.lc = r9;;      // Put back the caller's ar.lc
        br.ret.sptk.many b0;;     // Return to the caller
        .endp   main              // End of procedure main