/usr/src/gcc-4.4/debian/patches/arm-thumb2-speedup-division.diff is in gcc-4.4-source 4.4.7-1ubuntu2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# DP: ARM: speed up division on Thumb-2 (backport from the trunk)
2009-08-06 Paul Brook <paul@codesourcery.com>
gcc/
* config/arm/lib1funcs.asm (ARM_DIV_BODY): Add Thumb-2 implementation.
(udivsi3, aeabi_uidivmod, divsi3, aeabi_idivmod): Only use Thumb-1
implementation on ARMv6-M.
---
src/gcc/config/arm/lib1funcs.asm | 63 +++++++++++++++++++++++++++++++++------
1 file changed, 54 insertions(+), 9 deletions(-)
--- a/src/gcc/config/arm/lib1funcs.asm
+++ b/src/gcc/config/arm/lib1funcs.asm
@@ -436,10 +436,31 @@ pc .req r15
/* ------------------------------------------------------------------------ */
.macro ARM_DIV_BODY dividend, divisor, result, curbit
#if __ARM_ARCH__ >= 5 && ! defined (__OPTIMIZE_SIZE__)
+#if defined (__thumb2__)
+ clz \curbit, \dividend
+ clz \result, \divisor
+ sub \curbit, \result, \curbit
+ rsb \curbit, \curbit, #31
+ adr \result, 1f
+ add \curbit, \result, \curbit, lsl #4
+ mov \result, #0
+ mov pc, \curbit
+.p2align 3
+1:
+ .set shift, 32
+ .rept 32
+ .set shift, shift - 1
+ cmp.w \dividend, \divisor, lsl #shift
+ nop.n
+ adc.w \result, \result, \result
+ it cs
+ subcs.w \dividend, \dividend, \divisor, lsl #shift
+ .endr
+#else
clz \curbit, \dividend
clz \result, \divisor
sub \curbit, \result, \curbit
rsbs \curbit, \curbit, #31
addne \curbit, \curbit, \curbit, lsl #1
@@ -451,10 +472,11 @@ pc .req r15
.set shift, shift - 1
cmp \dividend, \divisor, lsl #shift
adc \result, \result, \result
subcs \dividend, \dividend, \divisor, lsl #shift
.endr
+#endif
#else /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */
#if __ARM_ARCH__ >= 5
clz \curbit, \divisor
@@ -498,22 +520,27 @@ pc .req r15
#endif /* __ARM_ARCH__ < 5 */
@ Division loop
1: cmp \dividend, \divisor
+ do_it hs, t
subhs \dividend, \dividend, \divisor
orrhs \result, \result, \curbit
cmp \dividend, \divisor, lsr #1
+ do_it hs, t
subhs \dividend, \dividend, \divisor, lsr #1
orrhs \result, \result, \curbit, lsr #1
cmp \dividend, \divisor, lsr #2
+ do_it hs, t
subhs \dividend, \dividend, \divisor, lsr #2
orrhs \result, \result, \curbit, lsr #2
cmp \dividend, \divisor, lsr #3
+ do_it hs, t
subhs \dividend, \dividend, \divisor, lsr #3
orrhs \result, \result, \curbit, lsr #3
cmp \dividend, #0 @ Early termination?
+ do_it ne, t @ IT condition must match the movne pair below
movnes \curbit, \curbit, lsr #4 @ No, any more bits to do?
movne \divisor, \divisor, lsr #4
bne 1b
#endif /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */
@@ -798,15 +825,15 @@ LSYM(Lgot_result):
/* ------------------------------------------------------------------------ */
/* Start of the Real Functions */
/* ------------------------------------------------------------------------ */
#ifdef L_udivsi3
+#if defined(__ARM_ARCH_6M__)
+
FUNC_START udivsi3
FUNC_ALIAS aeabi_uidiv udivsi3
-#ifdef __thumb__
-
cmp divisor, #0
beq LSYM(Ldiv0)
mov curbit, #1
mov result, #0
@@ -818,13 +845,17 @@ LSYM(Lgot_result):
mov r0, result
pop { work }
RET
-#else /* ARM version. */
+#else /* ARM version/Thumb-2. */
+
+ ARM_FUNC_START udivsi3
+ ARM_FUNC_ALIAS aeabi_uidiv udivsi3
subs r2, r1, #1
+ do_it eq
RETc(eq)
bcc LSYM(Ldiv0)
cmp r0, r1
bls 11f
tst r1, r2
@@ -833,11 +864,12 @@ LSYM(Lgot_result):
ARM_DIV_BODY r0, r1, r2, r3
mov r0, r2
RET
-11: moveq r0, #1
+11: do_it eq, e
+ moveq r0, #1
movne r0, #0
RET
12: ARM_DIV2_ORDER r1, r2
@@ -846,19 +878,20 @@ LSYM(Lgot_result):
#endif /* ARM version */
DIV_FUNC_END udivsi3
+#if defined(__ARM_ARCH_6M__)
FUNC_START aeabi_uidivmod
-#ifdef __thumb__
push {r0, r1, lr}
bl SYM(__udivsi3)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
#else
+ARM_FUNC_START aeabi_uidivmod
stmfd sp!, { r0, r1, lr }
bl SYM(__udivsi3)
ldmfd sp!, { r1, r2, lr }
mul r3, r2, r0
sub r1, r1, r3
@@ -909,14 +942,15 @@ LSYM(Lover10):
#endif /* L_umodsi3 */
/* ------------------------------------------------------------------------ */
#ifdef L_divsi3
+#if defined(__ARM_ARCH_6M__)
+
FUNC_START divsi3
FUNC_ALIAS aeabi_idiv divsi3
-#ifdef __thumb__
cmp divisor, #0
beq LSYM(Ldiv0)
push { work }
mov work, dividend
@@ -944,60 +978,71 @@ LSYM(Lover11):
neg r0, r0
LSYM(Lover12):
pop { work }
RET
-#else /* ARM version. */
+#else /* ARM/Thumb-2 version. */
+ ARM_FUNC_START divsi3
+ ARM_FUNC_ALIAS aeabi_idiv divsi3
+
cmp r1, #0
eor ip, r0, r1 @ save the sign of the result.
beq LSYM(Ldiv0)
+ do_it mi
rsbmi r1, r1, #0 @ loops below use unsigned.
subs r2, r1, #1 @ division by 1 or -1 ?
beq 10f
movs r3, r0
+ do_it mi
rsbmi r3, r0, #0 @ positive dividend value
cmp r3, r1
bls 11f
tst r1, r2 @ divisor is power of 2 ?
beq 12f
ARM_DIV_BODY r3, r1, r0, r2
cmp ip, #0
+ do_it mi
rsbmi r0, r0, #0
RET
10: teq ip, r0 @ same sign ?
+ do_it mi
rsbmi r0, r0, #0
RET
-11: movlo r0, #0
+11: do_it lo
+ movlo r0, #0
+ do_it eq,t
moveq r0, ip, asr #31
orreq r0, r0, #1
RET
12: ARM_DIV2_ORDER r1, r2
cmp ip, #0
mov r0, r3, lsr r2
+ do_it mi
rsbmi r0, r0, #0
RET
#endif /* ARM version */
DIV_FUNC_END divsi3
+#if defined(__ARM_ARCH_6M__)
FUNC_START aeabi_idivmod
-#ifdef __thumb__
push {r0, r1, lr}
bl SYM(__divsi3)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
#else
+ARM_FUNC_START aeabi_idivmod
stmfd sp!, { r0, r1, lr }
bl SYM(__divsi3)
ldmfd sp!, { r1, r2, lr }
mul r3, r2, r0
sub r1, r1, r3
|