The biggest obstacle here seems to be that the flow-chart at the top of the question is either incomplete or plain incorrect. I was at first confused about the right-shifting of the divisor shown in the flow-chart, as most bit-by-bit division algorithms keep the divisor unchanged and instead apply a left shift to the concatenation of partial remainder and quotient. I then recalled that Niklaus Wirth presented just such a shift-divisor method in his book Systematic Programming.
Wirth's method is the binary analogue to the decimal long-hand division we learned in elementary school, which requires that we align the most significant digit of the divisor with the most significant digit of the dividend before starting with subtractive iterations to determine the quotient digits. In the same way Wirth's algorithm requires that we "normalize" the divisor so the most significant 1-bits of divisor and dividend are in the same bit position. It is this preparatory step that is missing from the flow chart. By simply right-shifting the divisor, the low-order bits of the divisor are lost one-by-one. The hardware diagram at the bottom of the question is likewise suspect, because a 32-bit division does not require a 64-bit ALU nor a 64-bit divisor and remainder registers. At most it requires 64-bit shift-by-one capability.
The method of normalization given by Wirth is incorrect in finite-precision unsigned integer arithmetic due to overflow in intermediate computation. A correct canonical method of divisor normalization is to count the leading zeros of the two operands, with the difference indicating how many bits the divisor needs to be shifted left. Various processor architectures have a specialized instruction for this, often called CLZ, but base RISC-V does not have such an instruction. The operation can be emulated, which is a bit cumbersome. In a 2008 research report Rodeheffer gave a modified construction which avoids the overflow problem and does not require a count-leading-zeros primitive.
As I am unfamiliar with RISC-V assembly language I am showing ISO-C99 code below that should translate 1:1 into assembly code for 32-bit RISC-V. I am showing the variant based directly on Wirth's algorithm as well as the one based on Rodeheffer's modification.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define USE_WIRTH (1)
#define NUM_TESTS (10000000000ULL)
#if USE_WIRTH
/* Based on N. Wirth, "Systematic Programming", 1973, section 7.3 */
/* count leading zeros */
int clz (uint32_t a)
{
uint32_t b;
int n = 32;
if ((b = (a >> 16))) { n -= 16; a = b; }
if ((b = (a >> 8))) { n -= 8; a = b; }
if ((b = (a >> 4))) { n -= 4; a = b; }
if ((b = (a >> 2))) { n -= 2; a = b; }
if (( (a >> 1))) return n - 2;
return n - (int)a;
}
uint32_t udiv (uint32_t x, uint32_t y)
{
uint32_t remainder = x;
uint32_t quotient = 0;
uint32_t divisor = y;
int reps;
if (x >= y) {
reps = clz (divisor) - clz (remainder); // bits in quotient
divisor = divisor << reps; // normalize divisor
do {
remainder = remainder - divisor; // preliminary partial remainder
if ((int32_t)remainder >= 0) { // partial remainder non-negative
quotient = quotient << 1 | 1; // record quotient bit: 1
} else {
remainder = remainder + divisor;// restore remainder
quotient = quotient << 1 | 0; // record quotient bit: 0
}
divisor = divisor >> 1; // halve divisor
reps = reps - 1;
} while (reps >= 0); // until all quotient bits done
}
return quotient;
}
#else // !USE_WIRTH
/* Based on Thomas L. Rodeheffer, "Software Integer Division".
Microsoft Research Report, Apr. 2008, fig. 2
*/
uint32_t udiv (uint32_t x, uint32_t y)
{
uint32_t remainder = x;
uint32_t quotient = 0;
uint32_t divisor = y;
if (x >= divisor) {
x = x - divisor;
while (x >= divisor) { // normalize divisor
x = x - divisor;
divisor = divisor << 1;
}
do {
remainder = remainder - divisor; // preliminary partial remainder
if ((int32_t)remainder >= 0) { // partial remainder non-negative
quotient = (quotient << 1) | 1; // record quotient bit: 1
} else {
remainder = remainder + divisor;// restore remainder
quotient = (quotient << 1) | 0; // record quotient bit: 0
}
if (divisor == y) break;
divisor = divisor >> 1; // halve divisor
} while (1);
}
return quotient;
}
#endif // USE_WIRTH
// George Marsaglia's KISS PRNG, period 2**123. Newsgroup sci.math, 21 Jan 1999
// Bug fix: Greg Rose, "KISS: A Bit Too Simple" http://eprint.iacr.org/2011/007
static uint32_t kiss_z=362436069, kiss_w=521288629;
static uint32_t kiss_jsr=123456789, kiss_jcong=380116160;
#define znew (kiss_z=36969*(kiss_z&65535)+(kiss_z>>16))
#define wnew (kiss_w=18000*(kiss_w&65535)+(kiss_w>>16))
#define MWC ((znew<<16)+wnew )
#define SHR3 (kiss_jsr^=(kiss_jsr<<13),kiss_jsr^=(kiss_jsr>>17), \
kiss_jsr^=(kiss_jsr<<5))
#define CONG (kiss_jcong=69069*kiss_jcong+1234567)
#define KISS ((MWC^CONG)+SHR3)
int main (void)
{
unsigned long long int count = 0;
uint32_t dividend, divisor, res, ref;
do {
dividend = KISS;
do {
divisor = KISS;
} while (divisor == 0);
ref = dividend / divisor;
res = udiv (dividend, divisor);
if (res != ref) {
printf ("dividend=%08x divisor=%08x res=%08x ref=%08x\n", dividend, divisor, res, ref);
return EXIT_FAILURE;
}
count++;
if ((count & 0xffffff) == 0) printf ("\rcount=%llu", count);
} while (count < NUM_TESTS);
return EXIT_SUCCESS;
}
I noticed belatedly that flowchart and hardware diagram in the question are from Patterson and Hennessy, Computer Organization & Design: The Hardware / Software Interface, 1994, section 4.7. As such, it seems to represent a didactic rather than a real-life example. The only way I can make work what is being described is by emulating a subtraction with carry-out to indicate whether the partial remainder is negative after subtraction:
/* Use SPARC semantics: For a subtraction, carry is set to 1 if the subtraction
produced a borrow and to 0 otherwise.
*/
/* subtraction with carry-out */
#define SUBcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), cy=t1<t0, t1-t0)
uint32_t udiv (uint32_t x, uint32_t y)
{
uint64_t divisor = ((uint64_t)y) << 32;
uint64_t remainder = x;
uint64_t t0, t1;
uint32_t cy, quotient = 0;
int reps = 0;
do {
remainder = SUBcc (remainder, divisor, cy, t0, t1); // new remainder
if (cy == 0) { // remainder non-negative
quotient = quotient << 1 | 1; // record quotient bit: 1
} else {
remainder = remainder + divisor;// restore old remainder
quotient = quotient << 1 | 0; // record quotient bit: 0
}
divisor = divisor >> 1; // halve divisor
reps = reps + 1;
} while (reps < 33);
return quotient;
}
On a 32-bit RISC-V platform, all of the 64-bit operations would have to be emulated:
/* addition with carry-out */
#define ADDcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
/* addition with carry-in */
#define ADDC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t0+t1)
/* addition with carry-in and carry-out */
#define ADDCcc(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)
/* Use SPARC semantics: For a subtraction, carry is set to 1 if the subtraction
produced a borrow and to 0 otherwise.
*/
/* subtraction with carry-out */
#define SUBcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), cy=t1<t0, t1-t0)
/* subtraction with carry-in */
#define SUBC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t1-t0)
/* subtraction with carry-in and carry-out*/
#define SUBCcc(a,b,cy,t0,t1,t2) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)
uint32_t udiv (uint32_t x, uint32_t y)
{
uint32_t divisor_hi = y;
uint32_t divisor_lo = 0;
uint32_t remainder_hi = 0;
uint32_t remainder_lo = x;
uint32_t quotient = 0;
uint32_t cy, t0, t1, t2;
int reps = 0;
do {
remainder_lo = SUBcc (remainder_lo, divisor_lo, cy, t0, t1);
remainder_hi = SUBCcc (remainder_hi, divisor_hi, cy, t0, t1, t2);
if (cy == 0) { // partial remainder non-negative
quotient = quotient << 1 | 1; // record quotient bit: 1
} else {
remainder_lo = ADDcc (remainder_lo, divisor_lo, cy, t0, t1);
remainder_hi = ADDC (remainder_hi, divisor_hi, cy, t0, t1);
quotient = quotient << 1 | 0; // record quotient bit: 0
}
divisor_lo = (divisor_lo >> 1) | (divisor_hi << 31);
divisor_hi = divisor_hi >> 1;
reps = reps + 1;
} while (reps < 33);
return quotient;
}
I conclude that the example chosen by Patterson and Hennessy is an exceedingly poor choice for an initial introduction to binary division algorithms.
t6? Why? Did you mean to sett6to 32? If so, that's not how, as that adds 32 tot6. You're not showing the code forcontrollo_segno_div, so we can't help beyond that. Except perhaps to state thatt6is a call-clobbered register, so should not be expected to survive a call to anything.