// DOTLOOP           Scalar Product of N-vectors

// This program will compute the scalar product
// of two multielement vectors V and W.
        N       = 3               // Component count shared by V and W
        .data                     // Everything below lives in the data section
        .align  8                 // 8-byte boundary, so the st8 to P is aligned
P:      .skip   8                 // 8-byte slot that receives the scalar product
V:      data2   -1, 3, 5          // First vector: N two-byte signed components
W:      data2   -2, -4, 6         // Second vector: N two-byte signed components
        .text                     // Switch to the code section
        .align  32                // Begin the code on a 32-byte boundary
        .global main              // Make the entry symbol visible to the linker
        .proc   main              // Open procedure 'main'
main:                             // Program entry point
        .body                     // No prologue region; the body starts here
// Register roles inside main:
//   r14 = cursor into V            r15 = cursor into W
//   r16 = address of P             r17 = components still to process
//   r20 = running sum              r21, r22 = current Vi, Wi / their product
//
// The five set-up instructions write five different registers and read
// none of them, so they share a single instruction group (one stop).
first:  movl    r14 = V           // r14 -> first component of V
        movl    r15 = W           // r15 -> first component of W
        movl    r16 = P           // r16 -> result slot
        mov     r17 = N           // r17 = number of components
        mov     r20 = 0;;         // sum = 0
// Loop body: executed once per component, test at the bottom.
// Both loads and the count-down are mutually independent, so they
// are issued together; the multiply/extend/accumulate chain follows.
top:    ld2     r21 = [r14],2     // r21 = Vi; advance V cursor by 2 bytes
        ld2     r22 = [r15],2     // r22 = Wi; advance W cursor by 2 bytes
        add     r17 = -1,r17;;    // One fewer component remaining
        pmpy2.r r21 = r21,r22;;   // 16-bit x 16-bit -> 32-bit product Vi*Wi
        sxt4    r21 = r21;;       // Sign-extend the 32-bit product to 64 bits
        add     r20 = r21,r20     // sum += Vi*Wi
        cmp.gt  p6,p0 = r17,r0    // p6 = (components remaining > 0)
        (p6) br.cond.sptk.few top;; // Go around again while any remain
// Loop finished: publish the result and return success.
        st8     [r16] = r20       // P = sum (64-bit store)
done:   mov     r8 = 0            // r8 = 0: report normal completion
        br.ret.sptk.many b0;;     // Return to the caller through b0
        .endp   main              // Close procedure 'main'