// Program entry: load the two test operands, then jump to half_float_mult.
// r0 = multiplicand; in this code it is set to 0x4110
	// r1 = multiplier; in this code it is set to 0x0421
	// r2 = address to store upper half of product
	// r3 = address to store lower half of product
	// (r2 and r3 are not loaded here; half_float_mult sets them just
	// before it calls uint16_mult)

	// each 16-bit constant is built from two 8-bit immediates:
	// movi supplies 0x10 and movis supplies 0x41 to form 0x4110
	// (presumably movi = low byte, movis = high byte -- confirm in ISA spec)
	movi    R0, #0x10
	movis   R0, #0x41
	movi    R1, #0x21
	movis   R1, #0x04
	// skip over the helper routines defined below
	b       half_float_mult

/*
 * uint16_mult: multiply the values in R0 and R1.
 *   In:  R0 = multiplicand, R1 = multiplier
 *        R2 = address to store upper half of product
 *        R3 = address to store lower half of product
 *        R6 = return address (called with "bl R6, uint16_mult")
 *        R7 = base address of the scratch area (the caller sets it to 32)
 * The multiplication itself is still a placeholder in this file; until it
 * is filled in, control falls through to uint16_mult_out and nothing is
 * stored at the addresses in R2/R3.
 */
uint16_mult:
	// save the return address at [R7 + 0]; uint16_mult_out reloads it
	// from there before branching back to the caller
	stwi    R6, R7, #0

/*** --- DO NOT EDIT THIS LINE OR ANYTHING ABOVE THIS LINE --- ***/

/* YOUR FILE-HEADER COMMENT HERE */

/*** PART 5: add a comment here that states how many CPU cycles are
    needed to run this program ****/

/*** PART 5: COPY YOUR uint16_mult implementation from HW4 HERE ***/

// Common exit of uint16_mult: recover the return address that the entry
// code stored at [R7 + 0] and branch back to the caller.
uint16_mult_out:
	// re-establish the scratch-area base; R7 is reloaded rather than
	// trusted, so the multiply body may use it freely
	movi    R7, #32
	// R6 = saved return address
	ldwi    R6, R7, #0
	// return to the caller
	br      R6

/**
 * Calculate the sign bit of the product of two half-precision
 * floating-point numbers. Registers R0 and R1 hold the operands.
 * The caller (half_float_mult) reads the result from R0.
 */
calc_sign:
	// save the return address (R6, set by "bl R6, calc_sign") at
	// [R7 + 0]; the caller sets R7 to 32 before the call
	stwi    R6, R7, #0

/*** PART 5: Implement calc_sign() here.  ***/

/*** --- DO NOT EDIT THIS LINE OR ANYTHING BELOW THIS LINE --- ***/
// Common exit of calc_sign: reload the saved return address and branch
// back to the caller. R7 is reloaded first, so the body may clobber it.
calc_sign_out:
	movi    R7, #32
	// R6 = return address saved on entry
	ldwi    R6, R7, #0
	br      R6

/*
 * half_float_mult: multiply two half-precision floating-point values.
 *
 *   In:   R0 = multiplicand, R1 = multiplier (raw 16-bit encodings:
 *         bit 15 = sign, bits 14..10 = exponent, bits 9..0 = fraction)
 *   Out:  the product is stored at address 32 + 8 (0x28); the routine
 *         then halts instead of returning.
 *   Scratch memory, relative to address 32:
 *         +0 return-address slot used by the callees
 *         +1 original multiplicand     +2 original multiplier
 *         +3 product upper half        +4 product lower half
 *   Clobbers R0-R7.
 *
 * Note: "lsh" with a positive count shifts right and with a negative
 * count shifts left; the right shifts below rely on zero fill.
 *
 * Limitations: both operands are treated as normalized (the implicit
 * leading 1 is always OR'd in), the result exponent is masked to 5 bits
 * without overflow/underflow checks, and a round-up that carries out of
 * the 10-bit fraction is not handled.
 */
half_float_mult:
	// r7 = brk (divider between stack and heap)
	movi    R7, #32
	// keep the untouched operands; they are needed again after the calls
	stwi    R0, R7, #1
	stwi    R1, R7, #2

	// extract significands and then multiply them
	// r4 = 0x03ff (fraction mask), r5 = 1 << 10 (implicit leading one)
	movi    R4, #0xff
	movis   R4, #0x03
	movi    R5, #1
	lsh     R5, R5, #-10
	// r0 = (original multiplicand & 0x3ff) | (1 << 10)
	and.    R0, R0, R4
	or.     R0, R0, R5
	// r1 = (original multiplier & 0x3ff) | (1 << 10)
	and.    R1, R1, R4
	or.     R1, R1, R5
	// r2 / r3 = where uint16_mult stores the upper / lower product half
	addi.   R2, R7, #3
	addi.   R3, R7, #4
	bl      R6, uint16_mult

	// calculate sign bit
	// the callee may have clobbered every register, so reload brk and
	// the original operands before calling calc_sign
	movi    R7, #32
	ldwi    R0, R7, #1
	ldwi    R1, R7, #2
	bl      R6, calc_sign

	// prepare registers for rest of half_float_mult()
	// r2 = value returned by calc_sign in r0
	// NOTE(review): r2 is overwritten by the inline sign computation
	// below, so the calc_sign result is currently unused -- confirm
	// whether that is intended.
	or.     R2, R0, R0
	movi    R7, #32
	ldwi    R0, R7, #1
	ldwi    R1, R7, #2
	movi    R3, #0
	ldwi    R4, R7, #3
	ldwi    R5, R7, #4

	// r0 = original multiplicand
	// r1 = original multiplier
	// r2 = ret_sign
	// r3 = ret_exp
	// r4 = product_upper (after calling uint16_mult())
	// r5 = product_lower (after calling uint16_mult())

	// ret_sign = 0 when both sign bits (bit 15) agree, otherwise the
	// upper byte of r2 is set to 0x80 (i.e. r2 = 0x8000)
	lsh     R6, R0, #15
	lsh     R7, R1, #15
	movi    R2, #0
	cmp.    R6, R7
	b.eq    calc_exp
	movis   R2, #0x80

calc_exp:
	// ret_exp = exp(multiplicand) + exp(multiplier) - 15
	// (each 5-bit exponent field carries a bias of 15; remove one copy)
	lsh     R6, R0, #10
	andi.   R6, #0x1f
	lsh     R7, R1, #10
	andi.   R7, #0x1f
	add.    R3, R6, R7
	addi.   R3, R3, #-15

	// re-normalize product
	// the product is normalized when its leading one is bit 4 of
	// product_upper, i.e. 0x10 <= product_upper < 0x20
	cmpi.   R4, #0x20
	b.lt    maybe_shift_left

	// shift product to the right
	// r6 = (product_upper & 1) << 15: bit that moves into product_lower
	or.     R6, R4, R4
	andi.   R6, #1
	lsh     R6, R6, #-15
	lsh     R5, R5, #1
	or.     R5, R5, R6
	lsh     R4, R4, #1
	// one right shift of the significand = exponent + 1
	addi.   R3, R3, #1
	b       re_assemble_float

maybe_shift_left:
	// keep shifting product to the left until normalized
	cmpi.   R4, #0x10
	b.ge    re_assemble_float
	// r6 = product_lower >> 15: bit that moves into product_upper
	// (r6 must be loaded from r5 here; otherwise the carried bit is lost)
	or.     R6, R5, R5
	lsh     R6, R6, #15
	lsh     R4, R4, #-1
	or.     R4, R4, R6
	lsh     R5, R5, #-1
	// one left shift of the significand = exponent - 1
	addi.   R3, R3, #-1
	b       maybe_shift_left

re_assemble_float:
	// the 10-bit fraction is the 10 product bits just below the leading
	// one: 4 bits from product_upper and 6 bits from product_lower
	// r6 = (product_upper & 0xf) << 6
	and.    R6, R4, R4
	andi.   R6, #0xf
	lsh     R6, R6, #-6
	// r7 = (product_lower & 0xfc00) >> 10
	movi    R7, #0
	movis   R7, #0xfc
	and.    R7, R5, R7
	lsh     R7, R7, #10
	// r6 = ret_significand
	or.     R6, R6, R7

	// if (product_lower & 0x0200)
	// (bit 9 is the first bit dropped from the fraction)
	movi    R7, #0
	movis   R7, #0x02
	and.    R7, R5, R7
	// then round up
	// (the and. above set the flags; equal/zero means the bit is clear)
	b.eq    construct_final_value
	// r6 = (ret_significand + 1) (assuming no overflow)
	addi.   R6, R6, #1

construct_final_value:
	// construct final value
	// r0 = ret_sign | ((ret_exp & 0x1f) << 10) | ret_significand
	or.     R0, R2, R2
	andi.   R3, #0x1f
	lsh     R3, R3, #-10
	or.     R0, R0, R3
	or.     R0, R0, R6

	// write floating point product to memory
	movi    R7, #32
	stwi    R0, R7, #8
	/*
	 * if your PART 5 code works, then data memory should contain:
	 *   - at address 0x23 the value 0x0014 (the product upper half)
	 *   - at address 0x24 the value 0xe710 (the product lower half)
	 *   - at address 0x28 the value 0x093a (floating point product)
	 *   - at address 0xffff the value 0xffff
	 */
	halt