diff -r 42ef8fb0d914 -r 25e772c6cb98 src/arch/x86/isa/insts/general_purpose/arithmetic/multiply_and_divide.py
--- a/src/arch/x86/isa/insts/general_purpose/arithmetic/multiply_and_divide.py	Fri Apr 17 15:27:26 2015 -0500
+++ b/src/arch/x86/isa/insts/general_purpose/arithmetic/multiply_and_divide.py	Fri Apr 17 15:29:24 2015 -0500
@@ -249,22 +249,11 @@ def macroop DIV_%(suffix)s
 {
     %(readOp1)s
 
-    # Do the initial part of the division
-    div1 rdx, %(op1)s
+    # Initialize the division algorithm
+    div3 rdx, %(op1)s
 
-    #These are split out so we can initialize the number of bits in the
-    #second register
-    div2i t1, rax, "env.dataSize * 8"
-    div2 t1, rax, t1
-
-    #Loop until we're out of bits to shift in
-    #The amount of unrolling here could stand some tuning
-divLoopTop:
-    div2 t1, rax, t1
-    div2 t1, rax, t1
-    div2 t1, rax, t1
-    div2 t1, rax, t1, flags=(EZF,)
-    br label("divLoopTop"), flags=(nCEZF,)
+    # Run the algorithm
+    div4 rax
 
     #Unload the answer
     divq rax
@@ -379,21 +368,11 @@
     mov t1, t1, rax, flags=(nCECF,)
     mov t2, t2, rdx, flags=(nCECF,)
 
-    # Do the initial part of the division
-    div1 t2, t3
+    # Initialize the division algorithm
+    div3 t2, t3
 
-    #These are split out so we can initialize the number of bits in the
-    #second register
-    div2i t4, t1, "env.dataSize * 8"
-    div2 t4, t1, t4
-
-    #Loop until we're out of bits to shift in
-divLoopTop:
-    div2 t4, t1, t4
-    div2 t4, t1, t4
-    div2 t4, t1, t4
-    div2 t4, t1, t4, flags=(EZF,)
-    br label("divLoopTop"), flags=(nCEZF,)
+    # Run the algorithm
+    div4 t1
 
     #Unload the answer
     divq t5
diff -r 42ef8fb0d914 -r 25e772c6cb98 src/arch/x86/isa/microops/regop.isa
--- a/src/arch/x86/isa/microops/regop.isa	Fri Apr 17 15:27:26 2015 -0500
+++ b/src/arch/x86/isa/microops/regop.isa	Fri Apr 17 15:29:24 2015 -0500
@@ -704,6 +704,67 @@
                 PredezfBit = PredezfBit & ~(ext & EZFBit);
         '''
 
+    # Initialize the division algorithm: validate the operands and latch the
+    # upper half of the dividend and the divisor for Div4.
+    class Div3(WrRegOp):
+        code = '''
+            // Temporaries, just for clarity. psrc1 carries the upper half of
+            // the double-width dividend (rdx/t2 in the macroops).
+            uint64_t dividend = psrc1;
+            uint64_t divisor = op2;
+            // NOTE(review): the fault type was lost from this patch text
+            // (bare std::make_shared()); DivideError is assumed for both
+            // cases -- confirm against arch/x86/faults.hh.
+            if (divisor == 0) {
+                fault = std::make_shared<DivideError>();
+            } else if (dividend >= divisor) {
+                // The quotient would not fit in dataSize bits, so raise an
+                // exception.
+                fault = std::make_shared<DivideError>();
+            } else {
+                //Record the initial values.
+                Remainder = dividend;
+                Divisor = divisor;
+            }
+        '''
+
+    # Carry out the division algorithm
+    class Div4(RdRegOp):
+        op_class = 'IntDivOp'
+
+        code = '''
+            uint64_t hi = Remainder;
+            uint64_t lo = psrc1;
+            uint64_t divisor = Divisor;
+            uint64_t quotient = 0;
+            int bits = dataSize * 8;
+
+            // Schoolbook long division in base 2, one quotient bit per
+            // iteration: shift the next dividend bit from lo into hi, then
+            // subtract the divisor if it fits. Div3 guarantees hi < divisor
+            // on entry and every iteration preserves that.
+            for (int i = bits - 1; i >= 0; --i) {
+                // Bit 63 of hi is about to be shifted out. When it is set,
+                // the true value of 2 * hi + bit needs 65 bits and is
+                // therefore larger than any 64-bit divisor.
+                bool carry = (hi >> 63) & 1;
+                hi = (hi << 1) | ((lo >> (bits - 1)) & 1);
+                lo <<= 1;
+                quotient <<= 1;
+
+                if (carry || hi >= divisor) {
+                    // With carry set the subtraction wraps modulo 2^64,
+                    // which still leaves the exact remainder because that
+                    // remainder is below the divisor.
+                    hi -= divisor;
+                    quotient |= 1;
+                }
+            }
+            //Record the final results
+            Remainder = hi;
+            Quotient = quotient;
+        '''
+
     class Divq(RdRegOp):
         code = 'DestReg = merge(SrcReg1, Quotient, dataSize);'
         big_code = 'DestReg = Quotient & mask(dataSize * 8);'