// BOOTH      64 x 64 --> 128 bit signed multiplication
//
// Multiplies signed in0 (multiplicand X) by signed in1 (multiplier)
// with Booth's algorithm, consuming one multiplier bit per loop
// traversal.  The low segment of the full signed product is returned
// in ret0 (r8) and the high segment in ret1 (r9).
//
// Working state:  L:R = ret1:ret0   128-bit shifting accumulator
//                 r19               multiplier bit n-1 (previous bit)
// Scratch used:   r20-r23, r31, p6-p9; ar.lc is saved and restored.
//
// Between shifts |L| <= |X|, so the only add/subtract whose true
// result does not fit in 64 signed bits is L - X with X = -2^63
// (true value +2^63).  That case is detected below and a 0 rather
// than the wrapped sign bit is shifted into L, so the product is
// correct for every pair of inputs, including X = -2^63.
        W       = 64              // W = number width
        .text                     // Section for code
        .align  32                // Desired alignment
        .global booth             // These three lines
        .proc   booth             //  mark the mandatory
booth:                            //   'booth' function entry
        .prologue                 // Leaf procedure
        .regstk 2,0,0,0           //  simply claims 2 ins
        .save   ar.lc,r31         // Save caller's ar.lc
        mov     r31 = ar.lc       //  in scratch register
        .body                     // Now we really begin...
first:  mov     ar.lc = W-1       // Traversals minus one
        mov     r19 = 0           // Set bit n-1 to zero
        mov     ret0 = in1        // Set R to multiplier
        mov     ret1 = 0;;        // Set L to zero
cycle:  and     r22 = 0x1,ret0    // Isolate lowest bit of R
        andcm   r20 = in0,ret1;;  // Bit 63: X < 0 and old L >= 0
        xor     r23 = r19,r22;;   // r23 = whether to act
        cmp.ne  p6,p0 = 0,r23     // p6 = whether to act
        mov     r19 = r22;;       // Bit n-1 for next time
   (p6) cmp.eq.unc p7,p8 = 0,r22;; // Add, subtract, nop?
   (p7) add     ret1 = ret1,in0   // Add X to L
   (p8) sub     ret1 = ret1,in0;; // Subtract X from L
        shr     r21 = ret1,63     // r21 = sign fill of wrapped L
   (p8) and     r20 = r20,ret1;;  // Bit 63: ...and new L < 0
   (p8) tbit.nz.unc p9,p0 = r20,63;; // p9 = subtract overflowed
                                  //  (.unc clears p9 if no subtract)
   (p9) mov     r21 = 0;;         // True L is +2^63: fill with 0
        shrp    ret0 = ret1,ret0,1 // New R of shifted LR
        shrp    ret1 = r21,ret1,1 // New L, true sign shifted in
        br.cloop.sptk.few cycle;; // More cycles?
done:                             // Full product in r9,r8
        mov     ar.lc = r31       // Restore caller's ar.lc
        br.ret.sptk.many b0;;     // Back to the caller
        .endp   booth             // Mark end of procedure