************************************************************************************************
* This library is used to extend the HC11's math capabilities                                  *
************************************************************************************************ 

* MULU_16_16 - unsigned 16-bit x 16-bit multiply giving a 32-bit product.
*
* MUL only multiplies two 8-bit values (A x B -> D), so the product is built
* from four 8 x 8 partial products. Both factors are passed on the stack and
* the 32-bit product is returned in the same four bytes the factors occupied.
* No static variables are used. Stack use is 10 bytes in total: 4 for the
* factors, 2 for the return address of the JSR and 4 of scratch space.
*
* Calling sequence (D is used to illustrate; the factors can equally be put
* on the stack through an index register or with MOVW):
*      LDD   Operand1
*      PSHD
*      LDD   Operand2
*      PSHD
*      JSR   MULU_16_16
*      PULD                       Most significant word of product.
*      PULD                       Least significant word of product.
*
* IMPORTANT: the caller MUST place four bytes on the stack before the call
* and MUST remove four bytes after it, even when the product is not wanted.
* This routine overwrites the four bytes that sit just above its return
* address, so an unbalanced stack corrupts the caller's data or breaks the
* caller's next RTS.
*
* Clobbers: D and the condition codes. X and Y are not used.
*
* NOTE(review): PSHD/PULD and n,SP addressing are CPU12 instructions although
* the file banner mentions the HC11 - confirm the intended target.
*
* Stack picture after the two PSHD below (offsets from SP):
*      $00..$03  scratch; becomes the product, most significant byte at $00
*      $04..$05  return address
*      $06,$07   Operand2 high byte, low byte
*      $08,$09   Operand1 high byte, low byte

MULU_16_16:    PSHD               ; Reserve 4 scratch bytes; the values pushed
               PSHD               ; are irrelevant, every byte is written below.
               LDAA  $09,SP       ; Operand1 low
               LDAB  $07,SP       ; Operand2 low
               MUL                ; D = low * low
               STD   $02,SP       ; Product bytes 1 and 0.
               LDAA  $09,SP       ; Operand1 low
               LDAB  $06,SP       ; Operand2 high
               MUL                ; D = first middle partial product
               ADDB  $02,SP       ; Add it in one byte up (weight 2^8).
               ADCA  #0           ; Cannot carry out: $FE01 + $FF < $10000.
               STD   $01,SP       ; Product bytes 2 and 1 (partial).
               LDAA  $08,SP       ; Operand1 high
               LDAB  $07,SP       ; Operand2 low
               MUL                ; D = second middle partial product
               ADDB  $02,SP
               ADCA  $01,SP       ; This sum CAN carry out of byte 2.
               STD   $01,SP       ; STD leaves C unchanged.
               LDAA  #0           ; LDAA leaves C unchanged (CLRA would clear it).
               ADCA  #0           ; A = carry out of the middle sum (0 or 1).
               STAA  $00,SP       ; Park it in byte 3; the next MUL destroys C.
               LDAA  $08,SP       ; Operand1 high
               LDAB  $06,SP       ; Operand2 high
               MUL                ; D = high * high
               ADDB  $01,SP       ; Add byte 2 accumulated so far.
               ADCA  $00,SP       ; Add the parked carry plus the carry from ADDB.
               STD   $00,SP       ; Product bytes 3 and 2 are now final.
               PULD               ; D = high word of product, frees 2 scratch bytes.
               STD   $04,SP       ; Store it over Operand2 (pulled first by caller).
               PULD               ; D = low word of product, frees the rest.
               STD   $04,SP       ; Store it over Operand1.
               RTS


* MULU_32_32 - unsigned 32-bit x 32-bit multiply giving a 64-bit product.
*
* Built on MULU_16_16. With the factors split into 16-bit halves
* (P = Pu:Pl is Operand1, Q = Qu:Ql is Operand2) the product is
*      R = (Pu*Qu*2^32) + (Pu*Ql*2^16) + (Pl*Qu*2^16) + (Pl*Ql)
* Both factors are passed on the stack and the 64-bit product is returned in
* the same eight bytes. No static memory is used.
*
* Calling sequence (D is used to illustrate; MOVW works as well). The least
* significant word of each operand is pushed BEFORE its most significant
* word, so that each operand sits in memory most significant word first:
*      LDD   Operand1(LSW)
*      PSHD
*      LDD   Operand1(MSW)
*      PSHD
*      LDD   Operand2(LSW)
*      PSHD
*      LDD   Operand2(MSW)
*      PSHD
*      JSR   MULU_32_32
*      PULD                       Most significant word of product
*      PULD                       Second most significant word of product
*      PULD                       Third most significant word of product
*      PULD                       Least significant word of product
*
* IMPORTANT: as with MULU_16_16, the caller MUST push eight bytes before the
* call and MUST remove eight bytes after it, even when the product is not
* wanted. The routine overwrites the eight bytes above its return address.
*
* Cost: four calls to MULU_16_16, i.e. 16 MUL instructions per call. Stack
* use peaks at 28 bytes (8 operand bytes, this routine's return address,
* 8 bytes of intermediate results, plus the 10 bytes of a MULU_16_16 call).
* NOTE(review): on a CPU12 target EMUL multiplies 16 x 16 directly and would
* be much cheaper - confirm the intended target before relying on this code.
*
* Clobbers: D and the condition codes. X and Y are not used.
*
* Partial products: T0 = Pl*Ql, T1 = Pl*Qu, T2 = Pu*Ql, T3 = Pu*Qu.
* Results are not pulled between the first three calls; the stack just grows
* so that T0, T1 and T2 are all available for the additions.
* Stack at entry:  $00 return, $02 Qu, $04 Ql, $06 Pu, $08 Pl.

MULU_32_32:    LDD   $08,SP       ; Pl
               PSHD
               LDD   $06,SP       ; Ql (offsets grew by 2 after the push)
               PSHD
               JSR   MULU_16_16   ; T0 = Pl*Ql left at $00..$03,SP
               LDD   $0C,SP       ; Pl
               PSHD
               LDD   $08,SP       ; Qu
               PSHD
               JSR   MULU_16_16   ; T1 = Pl*Qu left at $00..$03,SP
               LDD   $0E,SP       ; Pu
               PSHD
               LDD   $0E,SP       ; Ql
               PSHD
               JSR   MULU_16_16   ; T2 = Pu*Ql left at $00..$03,SP

* Stack now: $00 T2hi, $02 T2lo, $04 T1hi, $06 T1lo, $08 T0hi, $0A T0lo,
*            $0C return, $0E Qu, $10 Ql, $12 Pu, $14 Pl.
* S = T1 + T2 is a 33-bit value; both terms carry weight 2^16.
               LDD   $02,SP       ; T2lo
               ADDD  $06,SP       ; + T1lo
               STD   $06,SP       ; S low word (STD/LDD leave C unchanged)
               LDD   $00,SP       ; T2hi
               ADCB  $05,SP       ; + T1hi + carry from the low words
               ADCA  $04,SP
               STD   $04,SP       ; S high word
               LDD   #0
               ADCB  #0           ; D = bit 32 of S (0 or 1)
               STD   $02,SP       ; Keep it; T2 is no longer needed.

* Fold S into the result one word above T0. Carries always move toward the
* more significant word, which lives at the LOWER stack address.
               LDD   $08,SP       ; T0hi
               ADDD  $06,SP       ; + S low word
               STD   $08,SP       ; Result word 1 is final (word 0 is T0lo).
               LDD   $04,SP       ; S high word
               ADCB  #0           ; + carry out of word 1
               ADCA  #0
               STD   $06,SP       ; Result word 2, still missing T3lo.
               LDD   $02,SP       ; bit 32 of S
               ADCB  #0           ; + carry out of word 2
               ADCA  #0
               STD   $04,SP       ; Result word 3, still missing T3hi.
               PULD               ; Drop the four bytes that held T2 so the
               PULD               ; stack peak stays at 28 bytes.

* Stack now: $00 word3, $02 word2, $04 word1, $06 word0,
*            $08 return, $0A Qu, $0C Ql, $0E Pu, $10 Pl.
               LDD   $0E,SP       ; Pu
               PSHD
               LDD   $0C,SP       ; Qu
               PSHD
               JSR   MULU_16_16   ; T3 = Pu*Qu left at $00..$03,SP
               LDD   $02,SP       ; T3lo
               ADDD  $06,SP       ; + word 2
               STD   $06,SP       ; Result word 2 is final.
               LDD   $00,SP       ; T3hi
               ADCB  $05,SP       ; + word 3 + carry out of word 2
               ADCA  $04,SP       ; Cannot overflow: the product fits 64 bits.
               STD   $04,SP       ; Result word 3 is final.
               PULD               ; Drop T3.
               PULD

* Move the four result words over the operands. Each PULD raises SP by two,
* so the next free operand slot is always at $08,SP.
               PULD               ; word 3 (most significant)
               STD   $08,SP       ; -> slot that held Qu, pulled first by caller
               PULD               ; word 2
               STD   $08,SP       ; -> slot that held Ql
               PULD               ; word 1
               STD   $08,SP       ; -> slot that held Pu
               PULD               ; word 0 (least significant)
               STD   $08,SP       ; -> slot that held Pl
               RTS