/*
 * Optimized IEEE 754 double precision arithmetic for the ARM architecture.
 *
 * Author:      Nicolas Pitre, <npitre@mvista.com> or <nico@cam.org>
 * Created:     Feb 07, 2003
 * Copyright:   (C) Monta Vista Software, Inc.
 *
 * This file is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
 * later version.
 *
 * In addition to the permissions in the GNU General Public License, the
 * author gives you unlimited permission to link the compiled version of
 * this file into combinations with other programs, and to distribute
 * those combinations without any restriction coming from the use of this
 * file.  (The General Public License restrictions do apply in other
 * respects; for example, they cover modification of the file, and
 * distribution when not linked into a combine executable.)
 *
 * This file is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * Notes: 
 * 
 * The goal of this code is to be as fast as possible.  This is
 * not meant to be easy to understand for the casual reader.
 * For slightly simpler code please see the single precision version
 * of this file.
 * 
 * Only the default rounding mode is intended for best performances.
 * Exceptions aren't supported yet, but that can be added quite easily
 * if necessary without impacting performances.
 */

@ This selects the minimum architecture level required.
#define __ARM_ARCH__ 4

@ Current double representation is always big endian on ARM.
@ If ever this historical oddity is removed then only redefining the
@ following macros for a little endian configuration will do the trick.
#define xh r0
#define xl r1
#define yh r2
#define yl r3

	.global	__negdf2
	.global	__subdf3
	.global	__adddf3
	.global	__floatunsidf
	.global	__floatsidf
	.global	__extendsfdf2
	.global	__muldf3
	.global	__divdf3
	.global	__gedf2
	.global	__gtdf2
	.global	__ledf2
	.global	__ltdf2
	.global	__nedf2
	.global	__eqdf2
	.global	__cmpdf2
	.global	__unorddf2
	.global	__fixdfsi
	.global	__fixunsdfsi
	.global	__truncdfsf2


__negdf2:
	@ flip sign bit
	eor	xh, xh, #0x80000000
	mov	pc, lr

__subdf3:
	@ flip sign bit of second arg
	eor	yh, yh, #0x80000000

__adddf3:

	@ Compare both args, return zero if equal but the sign.
	teq	xl, yl
	eoreq	ip, xh, yh
	teqeq	ip, #0x80000000
	beq	.ad_z

	@ If first arg is 0 or -0, return second arg.
	@ If second arg is 0 or -0, return first arg.
	orrs	ip, xl, xh, lsl #1
	moveq	xl, yl
	moveq	xh, yh
	orrnes	ip, yl, yh, lsl #1
	moveq	pc, lr

	stmfd	sp!, {r4, r5, lr}

	@ Mask out exponents.
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	r4, xh, ip
	and	r5, yh, ip

	@ If either of them is 0x7ff, result will be INF or NAN
	teq	r4, ip
	teqne	r5, ip
	beq	.ad_i

	@ Compute exponent difference.  Make largest exponent in r4,
	@ corresponding arg in xh-xl, and positive exponent difference in r5.
	subs	r5, r5, r4
	rsblt	r5, r5, #0
	ble	1f
	add	r4, r4, r5
	eor	yl, xl, yl
	eor	yh, xh, yh
	eor	xl, yl, xl
	eor	xh, yh, xh
	eor	yl, xl, yl
	eor	yh, xh, yh
1:

	@ If exponent difference is too large, return largest argument
	@ already in xh-xl.  We need up to 54 bit to handle proper rounding
	@ of 0x1p54 - 1.1.
	cmp	r5, #(54 << 20)
	ldmhifd	sp!, {r4, r5, pc}

	@ Convert mantissa to signed integer.
	orr	ip, ip, #0x80000000
	tst	xh, #0x80000000
	bic	xh, xh, ip
	orr	xh, xh, #0x00100000
	beq	1f
	rsbs	xl, xl, #0
	rsc	xh, xh, #0
1:
	tst	yh, #0x80000000
	bic	yh, yh, ip
	orr	yh, yh, #0x00100000
	beq	1f
	rsbs	yl, yl, #0
	rsc	yh, yh, #0
1:
	@ If exponent == difference, one or both args were denormalized.
	@ Since this is not common case, rescale them off line.
	teq	r4, r5
	beq	.ad_d
.ad_x:
	@ Scale down second arg with exponent difference.
	@ Apply shift one bit left to first arg and the rest to second arg
	@ to simplify things later, but only if exponent does not become 0.
	mov	ip, #0
	movs	r5, r5, lsr #20
	beq	3f
	teq	r4, #(1 << 20)
	beq	1f
	movs	xl, xl, lsl #1
	adc	xh, ip, xh, lsl #1
	sub	r4, r4, #(1 << 20)
	subs	r5, r5, #1
	beq	3f

	@ Shift yh-yl right per r5, keep leftover bits into ip.
1:	rsbs	lr, r5, #32
	blt	2f
	mov	ip, yl, lsl lr
	mov	yl, yl, lsr r5
	orr	yl, yl, yh, lsl lr
	mov	yh, yh, asr r5
	b	3f
2:	sub	r5, r5, #32
	add	lr, lr, #32
	cmp	yl, #1
	adc	ip, ip, yh, lsl lr
	mov	yl, yh, asr r5
	mov	yh, yh, asr #32
3:
	@ the actual addition
	adds	xl, xl, yl
	adc	xh, xh, yh

	@ We now have a result in xh-xl-ip.
	@ Keep absolute value in xh-xl-ip, sign in r5.
	ands	r5, xh, #0x80000000
	bpl	.ad_p
	rsbs	ip, ip, #0
	rscs	xl, xl, #0
	rsc	xh, xh, #0

	@ Determine how to normalize the result.
.ad_p:	cmp	xh, #0x00100000
	bcc	.ad_l
	cmp	r0, #0x00200000
	bcc	.ad_r0
	cmp	r0, #0x00400000
	bcc	.ad_r1

	@ Result needs to be shifted right.
	movs	xh, xh, lsr #1
	movs	xl, xl, rrx
	movs	ip, ip, rrx
	orrcs	ip, ip, #1
	add	r4, r4, #(1 << 20)
.ad_r1:	movs	xh, xh, lsr #1
	movs	xl, xl, rrx
	movs	ip, ip, rrx
	orrcs	ip, ip, #1
	add	r4, r4, #(1 << 20)

	@ Our result is now properly aligned into xh-xl, remaining bits in ip.
	@ Round with MSB of ip. If halfway between two numbers, round towards
	@ LSB of xl = 0.
.ad_r0:	adds	xl, xl, ip, lsr #31
	adc	xh, xh, #0
	teq	ip, #0x80000000
	biceq	xl, xl, #1

	@ One extreme rounding case may add a new MSB.  Adjust exponent.
	@ That MSB will be cleared when exponent is merged below. 
	tst	xh, #0x00200000
	addne	r4, r4, #(1 << 20)

	@ Make sure we did not bust our exponent.
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	cmp	r4, ip
	bhs	.ad_o

	@ Pack final result together.
.ad_e:	bic	xh, xh, #0x00300000
	orr	xh, xh, r4
	orr	xh, xh, r5
	ldmfd	sp!, {r4, r5, pc}

.ad_l:	@ Result must be shifted left and exponent adjusted.
	@ No rounding necessary since ip will always be 0.
#if __ARM_ARCH__ < 5

	teq	xh, #0
	movne	r3, #-11
	moveq	r3, #21
	moveq	xh, xl
	moveq	xl, #0
	mov	r2, xh
	movs	ip, xh, lsr #16
	moveq	r2, r2, lsl #16
	addeq	r3, r3, #16
	tst	r2, #0xff000000
	moveq	r2, r2, lsl #8
	addeq	r3, r3, #8
	tst	r2, #0xf0000000
	moveq	r2, r2, lsl #4
	addeq	r3, r3, #4
	tst	r2, #0xc0000000
	moveq	r2, r2, lsl #2
	addeq	r3, r3, #2
	tst	r2, #0x80000000
	addeq	r3, r3, #1

#else

	teq	xh, #0
	moveq	xh, xl
	moveq	xl, #0
	clz	r3, xh
	addeq	r3, r3, #32
	sub	r3, r3, #11

#endif

	@ determine how to shift the value.
	subs	r2, r3, #32
	bge	2f
	adds	r2, r2, #12
	ble	1f

	@ shift value left 21 to 31 bits, or actually right 11 to 1 bits
	@ since a register switch happened above.
	add	ip, r2, #20
	rsb	r2, r2, #12
	mov	xl, xh, lsl ip
	mov	xh, xh, lsr r2
	b	3f

	@ actually shift value left 1 to 20 bits, which might also represent
	@ 32 to 52 bits if counting the register switch that happened earlier.
1:	add	r2, r2, #20
2:	rsble	ip, r2, #32
	mov	xh, xh, lsl r2
	orrle	xh, xh, xl, lsr ip
	movle	xl, xl, lsl r2

	@ adjust exponent accordingly.
3:	subs	r4, r4, r3, lsl #20
	bgt	.ad_e

	@ Exponent too small, denormalize result.
	@ Find out proper shift value.
	mvn	r4, r4, asr #20
	subs	r4, r4, #30
	bge	2f
	adds	r4, r4, #12
	bgt	1f

	@ shift result right of 1 to 20 bits, sign is in r5.
	add	r4, r4, #20
	rsb	r2, r4, #32
	mov	xl, xl, lsr r4
	orr	xl, xl, xh, lsl r2
	orr	xh, r5, xh, lsr r4
	ldmfd	sp!, {r4, r5, pc}

	@ shift result right of 21 to 31 bits, or left 11 to 1 bits after
	@ a register switch from xh to xl.
1:	rsb	r4, r4, #12
	rsb	r2, r4, #32
	mov	xl, xl, lsr r2
	orr	xl, xl, xh, lsl r4
	mov	xh, r5
	ldmfd	sp!, {r4, r5, pc}

	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
	@ from xh to xl.
2:	mov	xl, xh, lsr r4
	mov	xh, r5
	ldmfd	sp!, {r4, r5, pc}

	@ Adjust exponents for denormalized arguments.
.ad_d:	teq	r4, #0
	eoreq	xh, xh, #0x00100000
	addeq	r4, r4, #(1 << 20)
	eor	yh, yh, #0x00100000
	subne	r5, r5, #(1 << 20)
	b	.ad_x

	@ Result is x - x = 0, unless x = INF or NAN.
.ad_z:	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	r2, xh, ip
	teq	r2, ip
	orreq	xh, ip, #0x00080000
	movne	xh, #0
	mov	xl, #0
	mov	pc, lr

	@ Overflow: return INF.
.ad_o:	orr	xh, r5, #0x7f000000
	orr	xh, xh, #0x00f00000
	mov	xl, #0
	ldmfd	sp!, {r4, r5, pc}

	@ At least one of x or y is INF/NAN.
	@   if xh-xl != INF/NAN: return yh-yl (which is INF/NAN)
	@   if yh-yl != INF/NAN: return xh-xl (which is INF/NAN)
	@   if either is NAN: return NAN
	@   if opposite sign: return NAN
	@   return xh-xl (which is INF or -INF)
.ad_i:	teq	r4, ip
	movne	xh, yh
	movne	xl, yl
	teqeq	r5, ip
	ldmnefd	sp!, {r4, r5, pc}
	orrs	r4, xl, xh, lsl #12
	orreqs	r4, yl, yh, lsl #12
	teqeq	xh, yh
	orrne	xh, r5, #0x00080000
	movne	xl, #0
	ldmfd	sp!, {r4, r5, pc}


__floatunsidf:
	teq	r0, #0
	moveq	r1, #0
	moveq	pc, lr
	stmfd	sp!, {r4, r5, lr}
	mov	r4, #(0x400 << 20)	@ initial exponent
	add	r4, r4, #((52-1) << 20)
	mov	r5, #0			@ sign bit is 0
	mov	xl, r0
	mov	xh, #0
	b	.ad_l


__floatsidf:
	teq	r0, #0
	moveq	r1, #0
	moveq	pc, lr
	stmfd	sp!, {r4, r5, lr}
	mov	r4, #(0x400 << 20)	@ initial exponent
	add	r4, r4, #((52-1) << 20)
	ands	r5, r0, #0x80000000	@ sign bit in r5
	rsbmi	r0, r0, #0		@ absolute value
	mov	xl, r0
	mov	xh, #0
	b	.ad_l


__extendsfdf2:
	movs	r2, r0, lsl #1
	beq	1f			@ value is 0.0 or -0.0
	mov	xh, r2, asr #3		@ stretch exponent
	mov	xh, xh, rrx		@ retrieve sign bit
	mov	xl, r2, lsl #28		@ retrieve remaining bits
	ands	r2, r2, #0xff000000	@ isolate exponent
	beq	2f			@ exponent was 0 but not mantissa
	teq	r2, #0xff000000		@ check if INF or NAN
	eorne	xh, xh, #0x38000000	@ fixup exponent otherwise.
	mov	pc, lr

1:	mov	xh, r0
	mov	xl, #0
	mov	pc, lr

2:	@ value was denormalized.  We can normalize it now.
	stmfd	sp!, {r4, r5, lr}
	mov	r4, #(0x380 << 20)	@ setup corresponding exponent
	add	r4, r4, #(1 << 20)
	and	r5, xh, #0x80000000	@ move sign bit in r5
	bic	xh, xh, #0x80000000
	b	.ad_l


__muldf3:

	stmfd	sp!, {r4, r5, r6, lr}

	@ Mask out exponents.
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	r4, xh, ip
	and	r5, yh, ip

	@ Trap any INF/NAN.
	teq	r4, ip
	teqne	r5, ip
	beq	.ml_s

	@ Trap any multiplication by 0.
	orrs	r6, xl, xh, lsl #1
	orrnes	r6, yl, yh, lsl #1
	beq	.ml_z

	@ Shift exponents right one bit to make room for overflow bit.
	@ If either of them is 0, scale denormalized arguments off line.
	@ Then add both exponents together.
	movs	r4, r4, lsr #1
	teqne	r5, #0
	beq	.ml_d
.ml_x:	add	r4, r4, r5, asr #1

	@ Preserve final sign in r4 along with exponent for now.
	teq	xh, yh
	orrmi	r4, r4, #0x8000

	@ Convert mantissa to unsigned integer.
	orr	ip, ip, #0x80000000
	bic	xh, xh, ip
	bic	yh, yh, ip
	orr	xh, xh, #0x00100000
	orr	yh, yh, #0x00100000

#if __ARM_ARCH__ < 4

	@ Well, no way to make it shorter without the umull instruction.
	@ We must perform that 53 x 53 bit multiplication by hand.
	stmfd	sp!, {r7, r8, r9, sl, fp}
	mov	r7, xl, lsr #16
	mov	r8, yl, lsr #16
	mov	r9, xh, lsr #16
	mov	sl, yh, lsr #16
	bic	xl, xl, r7, lsl #16
	bic	yl, yl, r8, lsl #16
	bic	xh, xh, r9, lsl #16
	bic	yh, yh, sl, lsl #16
	mul	ip, xl, yl
	mul	fp, xl, r8
	mov	lr, #0
	adds	ip, ip, fp, lsl #16
	adc	lr, lr, fp, lsr #16
	mul	fp, r7, yl
	adds	ip, ip, fp, lsl #16
	adc	lr, lr, fp, lsr #16
	mul	fp, xl, sl
	mov	r5, #0
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, r7, yh
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, xh, r8
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, r9, yl
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, xh, sl
	mul	r6, r9, sl
	adds	r5, r5, fp, lsl #16
	adc	r6, r6, fp, lsr #16
	mul	fp, r9, yh
	adds	r5, r5, fp, lsl #16
	adc	r6, r6, fp, lsr #16
	mul	fp, xl, yh
	adds	lr, lr, fp
	mul	fp, r7, sl
	adcs	r5, r5, fp
	mul	fp, xh, yl
	adc	r6, r6, #0
	adds	lr, lr, fp
	mul	fp, r9, r8
	adcs	r5, r5, fp
	mul	fp, r7, r8
	adc	r6, r6, #0
	adds	lr, lr, fp
	mul	fp, xh, yh
	adcs	r5, r5, fp
	adc	r6, r6, #0
	ldmfd	sp!, {r7, r8, r9, sl, fp}

#else

	@ Here is the actual multiplication: 53 bits * 53 bits -> 106 bits.
	umull	ip, lr, xl, yl
	mov	r5, #0
	umlal	lr, r5, xl, yh
	umlal	lr, r5, xh, yl
	mov	r6, #0
	umlal	r5, r6, xh, yh

#endif

	@ The LSBs in ip are only significant for the final rounding.
	@ Fold them into one bit of lr.
	teq	ip, #0
	orrne	lr, lr, #1

	@ Put final sign in xh.
	mov	xh, r4, lsl #16
	bic	r4, r4, #0x8000

	@ Adjust result if one extra MSB appeared (one of four times).
	tst	r6, #(1 << 9)
	beq	1f
	add	r4, r4, #(1 << 19)
	movs	r6, r6, lsr #1
	movs	r5, r5, rrx
	movs	lr, lr, rrx
	orrcs	lr, lr, #1
1:
	@ Scale back to 53 bits.
	@ xh contains sign bit already.
	orr	xh, xh, r6, lsl #12
	orr	xh, xh, r5, lsr #20
	mov	xl, r5, lsl #12
	orr	xl, xl, lr, lsr #20

	@ Apply exponent bias, check range for underflow.
	mov	ip, #0x1f000000
	orr	ip, ip, #0x00f80000
	subs	r4, r4, ip
	ble	.ml_u

	@ Round the result.
	movs	lr, lr, lsl #12
	bpl	1f
	adds	xl, xl, #1
	adc	xh, xh, #0
	teq	lr, #0x80000000
	biceq	xl, xl, #1

	@ Rounding may have produced an extra MSB here.
	@ The extra bit is cleared before merging the exponent below.
	tst	xh, #0x00200000
	addne	r4, r4, #(1 << 19)
1:
	@ Check exponent for overflow.
	orr	ip, ip, #0x20000000
	cmp	r4, ip
	bge	.ml_o

	@ Add final exponent.
	bic	xh, xh, #0x00300000
	orr	xh, xh, r4, lsl #1
	ldmfd	sp!, {r4, r5, r6, pc}

	@ Result is 0, but determine sign anyway.
.ml_z:	eor	xh, xh, yh
.dv_z:	bic	xh, xh, #0x7fffffff
	mov	xl, #0
	ldmfd	sp!, {r4, r5, r6, pc}

	@ Check if denormalized result is possible, otherwise return signed 0.
.ml_u:	cmn	r4, #(53 << 19)
	movle	xl, #0
	ldmlefd	sp!, {r4, r5, r6, pc}

	@ Find out proper shift value.
.ml_r:	mvn	r4, r4, asr #19
	subs	r4, r4, #30
	bge	2f
	adds	r4, r4, #12
	bgt	1f

	@ shift result right of 1 to 20 bits, preserve sign bit, round, etc.
	add	r4, r4, #20
	rsb	r5, r4, #32
	mov	r3, xl, lsl r5
	mov	xl, xl, lsr r4
	orr	xl, xl, xh, lsl r5
	movs	xh, xh, lsl #1
	mov	xh, xh, lsr r4
	mov	xh, xh, rrx
	adds	xl, xl, r3, lsr #31
	adc	xh, xh, #0
	teq	lr, #0
	teqeq	r3, #0x80000000
	biceq	xl, xl, #1
	ldmfd	sp!, {r4, r5, r6, pc}

	@ shift result right of 21 to 31 bits, or left 11 to 1 bits after
	@ a register switch from xh to xl. Then round.
1:	rsb	r4, r4, #12
	rsb	r5, r4, #32
	mov	r3, xl, lsl r4
	mov	xl, xl, lsr r5
	orr	xl, xl, xh, lsl r4
	bic	xh, xh, #0x7fffffff
	adds	xl, xl, r3, lsr #31
	adc	xh, xh, #0
	teq	lr, #0
	teqeq	r3, #0x80000000
	biceq	xl, xl, #1
	ldmfd	sp!, {r4, r5, r6, pc}

	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
	@ from xh to xl.  Leftover bits are in r3-r6-lr for rounding.
2:	rsb	r5, r4, #32
	mov	r6, xl, lsl r5
	mov	r3, xl, lsr r4
	orr	r3, r3, xh, lsl r5
	mov	xl, xh, lsr r4
	bic	xh, xh, #0x7fffffff
	adds	xl, xl, r3, lsr #31
	adc	xh, xh, #0
	orrs	r6, r6, lr
	teqeq	r3, #0x80000000
	biceq	xl, xl, #1
	ldmfd	sp!, {r4, r5, r6, pc}

	@ One or both arguments are denormalized.
	@ Scale them leftwards and preserve sign bit.
.ml_d:	mov	lr, #0
	teq	r4, #0
	bne	2f
	and	r6, xh, #0x80000000
1:	movs	xl, xl, lsl #1
	adc	xh, lr, xh, lsl #1
	tst	xh, #0x00100000
	subeq	r4, r4, #(1 << 19)
	beq	1b
	orr	xh, xh, r6
2:	teq	r5, #0
	bne	.ml_x
	and	r6, yh, #0x80000000
3:	movs	yl, yl, lsl #1
	adc	yh, lr, yh, lsl #1
	tst	yh, #0x00100000
	subeq	r5, r5, #(1 << 20)
	beq	3b
	orr	yh, yh, r6
	b	.ml_x

	@ One or both args are INF or NAN.
.ml_s:	orrs	r6, xl, xh, lsl #1
	orrnes	r6, yl, yh, lsl #1
	beq	.ml_n			@ 0 * INF or INF * 0 -> NAN
	teq	r4, ip
	bne	1f
	orrs	r6, xl, xh, lsl #12
	bne	.ml_n			@ NAN * <anything> -> NAN
1:	teq	r5, ip
	bne	.ml_i
	orrs	r6, yl, yh, lsl #12
	bne	.ml_n			@ <anything> * NAN -> NAN

	@ Result is INF, but we need to determine its sign.
.ml_i:	eor	xh, xh, yh

	@ Overflow: return INF (sign already in xh).
.ml_o:	and	xh, xh, #0x80000000
	orr	xh, xh, #0x7f000000
	orr	xh, xh, #0x00f00000
	mov	xl, #0
	ldmfd	sp!, {r4, r5, r6, pc}

	@ Return NAN.
.ml_n:	mov	xh, #0x7f000000
	orr	xh, xh, #0x00f80000
	ldmfd	sp!, {r4, r5, r6, pc}


__divdf3:

	stmfd	sp!, {r4, r5, r6, lr}

	@ Mask out exponents.
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	r4, xh, ip
	and	r5, yh, ip

	@ Trap any INF/NAN or zeroes.
	teq	r4, ip
	teqne	r5, ip
	orrnes	r6, xl, xh, lsl #1
	orrnes	r6, yl, yh, lsl #1
	beq	.dv_s

	@ Shift exponents right one bit to make room for overflow bit.
	@ If either of them is 0, scale denormalized arguments off line.
	@ Then substract divisor exponent from dividend''s.
	movs	r4, r4, lsr #1
	teqne	r5, #0
	beq	.dv_d
.dv_x:	sub	r4, r4, r5, asr #1

	@ Preserve final sign into ip.
	eor	ip, xh, yh

	@ Convert mantissa to unsigned integer.
	@ Dividend -> r5-r6, divisor -> yh-yl.
	mov	r5, #0x10000000
	mov	yh, yh, lsl #12
	orr	yh, r5, yh, lsr #4
	orr	yh, yh, yl, lsr #24
	movs	yl, yl, lsl #8
	mov	xh, xh, lsl #12
	teqeq	yh, r5
	beq	.dv_1
	orr	r5, r5, xh, lsr #4
	orr	r5, r5, xl, lsr #24
	mov	r6, xl, lsl #8

	@ Initialize xh with final sign bit.
	and	xh, ip, #0x80000000

	@ Ensure result will land to known bit position.
	cmp	r5, yh
	cmpeq	r6, yl
	bcs	1f
	sub	r4, r4, #(1 << 19)
	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
1:
	@ Apply exponent bias, check range for over/underflow.
	mov	ip, #0x1f000000
	orr	ip, ip, #0x00f80000
	add	r4, r4, ip
	cmn	r4, #(53 << 19)
	ble	.dv_z
	orr	ip, ip, #0x20000000
	cmp	r4, ip
	bge	.ml_o

	@ Perform first substraction to align result to a nibble.
	subs	r6, r6, yl
	sbc	r5, r5, yh
	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	mov	xl, #0x00100000
	mov	ip, #0x00080000

	@ The actual division loop.
1:	cmp	r5, yh
	cmpeq	r6, yl
	bcc	2f
	subs	r6, r6, yl
	sbc	r5, r5, yh
	orr	xl, xl, ip
2:	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	cmp	r5, yh
	cmpeq	r6, yl
	bcc	3f
	subs	r6, r6, yl
	sbc	r5, r5, yh
	orr	xl, xl, ip, lsr #1
3:	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	cmp	r5, yh
	cmpeq	r6, yl
	bcc	4f
	subs	r6, r6, yl
	sbc	r5, r5, yh
	orr	xl, xl, ip, lsr #2
4:	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	cmp	r5, yh
	cmpeq	r6, yl
	bcc	5f
	subs	r6, r6, yl
	sbc	r5, r5, yh
	orr	xl, xl, ip, lsr #3
5:	orrs	lr, r5, r6
	beq	6f
	mov	r5, r5, lsl #4
	orr	r5, r5, r6, lsr #28
	mov	r6, r6, lsl #4
	mov	yh, yh, lsl #3
	orr	yh, yh, yl, lsr #29
	mov	yl, yl, lsl #3
	movs	ip, ip, lsr #4
	bne	1b

	@ We are done with a word of the result.
	@ Loop again for the low word if this pass was for the high word.
	tst	xh, #0x00100000
	bne	7f
	orr	xh, xh, xl
	mov	xl, #0
	mov	ip, #0x80000000
	b	1b
6:
	@ Be sure result starts in the high word.
	tst	xh, #0x00100000
	orreq	xh, xh, xl
	moveq	xl, #0
7:
	@ Check if denormalized result is needed.
	cmp	r4, #0
	ble	.dv_u

	@ Apply proper rounding.
	subs	ip, r5, yh
	subeqs	ip, r6, yl
	bcc	1f
	adds	xl, xl, #1
	adc	xh, xh, #0
	teq	ip, #0
	biceq	xl, xl, #1
1:
	@ Add exponent to result.
	bic	xh, xh, #0x00100000
	orr	xh, xh, r4, lsl #1
	ldmfd	sp!, {r4, r5, r6, pc}

.dv_1:	@ Division by 0x1p*: shortcut a lot of code.
	and	ip, ip, #0x80000000
	orr	xh, ip, xh, lsr #12
	mov	ip, #0x1f000000
	orr	ip, ip, #0x00f80000
	add	r4, r4, ip
	orr	ip, ip, #0x20000000
	cmp	r4, ip
	bge	.ml_o
	cmp	r4, #0
	orrgt	xh, xh, r4, lsl #1
	ldmgtfd	sp!, {r4, r5, r6, pc}
	cmn	r4, #(53 << 19)
	ble	.dv_z
	orr	xh, xh, #0x00100000
	mov	lr, #0
	b	.ml_r

	@ Result must be denormalized: put remainder in lr for
	@ rounding considerations.
.dv_u:	orr	lr, r5, r6
	b	.ml_r

	@ One or both arguments are denormalized.
	@ Scale them leftwards and preserve sign bit.
.dv_d:	mov	lr, #0
	teq	r4, #0
	bne	2f
	and	r6, xh, #0x80000000
1:	movs	xl, xl, lsl #1
	adc	xh, lr, xh, lsl #1
	tst	xh, #0x00100000
	subeq	r4, r4, #(1 << 19)
	beq	1b
	orr	xh, xh, r6
2:	teq	r5, #0
	bne	.dv_x
	and	r6, yh, #0x80000000
3:	movs	yl, yl, lsl #1
	adc	yh, lr, yh, lsl #1
	tst	yh, #0x00100000
	subeq	r5, r5, #(1 << 20)
	beq	3b
	orr	yh, yh, r6
	b	.dv_x

	@ One or both arguments is either INF, NAN or zero.
.dv_s:	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	teq	r4, ip
	teqeq	r5, ip
	beq	.ml_n			@ INF/NAN / INF/NAN -> NAN
	teq	r4, ip
	bne	1f
	orrs	r4, xl, xh, lsl #12
	bne	.ml_n			@ NAN / <anything> -> NAN
	b	.ml_i			@ INF / <anything> -> INF
1:	teq	r5, ip
	bne	2f
	orrs	r5, yl, yh, lsl #12
	bne	.ml_n			@ <anything> / NAN -> NAN
	b	.ml_z			@ <anything> / INF -> 0
2:	@ One or both arguments are 0.
	orrs	r4, xl, xh, lsl #1
	bne	.ml_i			@ <non_zero> / 0 -> INF
	orrs	r5, yl, yh, lsl #1
	bne	.ml_z			@ 0 / <non_zero> -> 0
	b	.ml_n			@ 0 / 0 -> NAN


__gedf2:
__gtdf2:
	mov	ip, #-1
	b	1f

__ledf2:
__ltdf2:
	mov	ip, #1
	b	1f

__nedf2:
__eqdf2:
__cmpdf2:
	mov	ip, #1			@ how should we specify unordered here?

1:	stmfd	sp!, {r4, r5, lr}

	@ Trap any INF/NAN first.
	mov	lr, #0x7f000000
	orr	lr, lr, #0x00f00000
	and	r4, xh, lr
	and	r5, yh, lr
	teq	r4, lr
	teqne	r5, lr
	beq	3f

	@ Test for equality.
	@ Note that 0.0 is equal to -0.0.
2:	orrs	ip, xl, xh, lsl #1	@ if x == 0.0 or -0.0
	orreqs	ip, yl, yh, lsl #1	@ and y == 0.0 or -0.0
	teqne	xh, yh			@ or xh == yh
	teqeq	xl, yl			@ and xl == yl
	moveq	r0, #0			@ then equal.
	ldmeqfd	sp!, {r4, r5, pc}

	@ Check for sign difference.
	teq	xh, yh
	movmi	r0, xh, asr #31
	orrmi	r0, r0, #1
	ldmmifd	sp!, {r4, r5, pc}

	@ Compare exponents.
	cmp	r4, r5

	@ Compare mantissa if exponents are equal.
	moveq	xh, xh, lsl #12
	cmpeq	xh, yh, lsl #12
	cmpeq	xl, yl
	movcs	r0, yh, asr #31
	mvncc	r0, yh, asr #31
	orr	r0, r0, #1
	ldmfd	sp!, {r4, r5, pc}

	@ Look for a NAN.
3:	teq	r4, lr
	bne	4f
	orrs	xl, xl, xh, lsl #12
	bne	5f			@ x is NAN
4:	teq	r5, lr
	bne	2b
	orrs	yl, yl, yh, lsl #12
	beq	2b			@ y is not NAN
5:	mov	r0, ip			@ return unordered code from ip
	ldmfd	sp!, {r4, r5, pc}


__unorddf2:
	str	lr, [sp, #-4]!
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	lr, xh, ip
	teq	lr, ip
	bne	1f
	orrs	xl, xl, xh, lsl #12
	bne	3f			@ x is NAN
1:	and	lr, yh, ip
	teq	lr, ip
	bne	2f
	orrs	yl, yl, yh, lsl #12
	bne	3f			@ y is NAN
2:	mov	r0, #0			@ arguments are ordered.
	ldr	pc, [sp], #4
3:	mov	r0, #1			@ arguments are unordered.
	ldr	pc, [sp], #4


__fixdfsi:
	orrs	ip, xl, xh, lsl #1
	beq	1f			@ value is 0.
	mrs	r3, cpsr		@ preserve C flag (the actual sign)

	@ check exponent range.
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	r2, xh, ip
	teq	r2, ip
	beq	2f			@ value is INF or NAN
	bic	ip, ip, #0x40000000
	cmp	r2, ip
	bcc	1f			@ value is too small
	add	ip, ip, #(31 << 20)
	cmp	r2, ip
	bcs	3f			@ value is too large

	rsb	r2, r2, ip
	mov	ip, xh, lsl #11
	orr	ip, ip, #0x80000000
	orr	ip, ip, xl, lsr #21
	mov	r2, r2, lsr #20
	mov	r0, ip, lsr r2
	tst	r3, #0x20000000		@ the sign bit
	rsbne	r0, r0, #0
	mov	pc, lr

1:	mov	r0, #0
	mov	pc, lr

2:	orrs	xl, xl, xh, lsl #12
	bne	4f			@ r0 is NAN.
3:	tst	r3, #0x20000000		@ the sign bit
	moveq	r0, #0x7fffffff		@ maximum signed positive si
	movne	r0, #0x80000000		@ maximum signed negative si
	mov	pc, lr

4:	mov	r0, #0			@ How should we convert NAN?
	mov	pc, lr

__fixunsdfsi:
	orrs	ip, xl, xh, lsl #1
	beq	1b			@ value is 0
	bcs	1b			@ value is negative

	@ check exponent range.
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	r2, xh, ip
	teq	r2, ip
	beq	1f			@ value is INF or NAN
	bic	ip, ip, #0x40000000
	cmp	r2, ip
	bcc	1b			@ value is too small
	add	ip, ip, #(31 << 20)
	cmp	r2, ip
	bhi	2f			@ value is too large

	rsb	r2, r2, ip
	mov	ip, xh, lsl #11
	orr	ip, ip, #0x80000000
	orr	ip, ip, xl, lsr #21
	mov	r2, r2, lsr #20
	mov	r0, ip, lsr r2
	mov	pc, lr

1:	orrs	xl, xl, xh, lsl #12
	bne	4b			@ value is NAN.
2:	mov	r0, #0xffffffff		@ maximum unsigned si
	mov	pc, lr


__truncdfsf2:
	orrs	r2, xl, xh, lsl #1
	moveq	r0, r2, rrx
	moveq	pc, lr			@ value is 0.0 or -0.0
	
	@ check exponent range.
	mov	ip, #0x7f000000
	orr	ip, ip, #0x00f00000
	and	r2, ip, xh
	teq	r2, ip
	beq	2f			@ value is INF or NAN
	bic	xh, xh, ip
	cmp	r2, #(0x380 << 20)
	bls	4f			@ value is too small

	@ shift and round mantissa
1:	movs	r3, xl, lsr #29
	adc	r3, r3, xh, lsl #3

	@ if halfway between two numbers, round towards LSB = 0.
	mov	xl, xl, lsl #3
	teq	xl, #0x80000000
	biceq	r3, r3, #1

	@ rounding might have created an extra MSB.  If so adjust exponent.
	tst	r3, #0x00800000
	addne	r2, r2, #(1 << 20)
	bicne	r3, r3, #0x00800000

	@ check exponent for overflow
	mov	ip, #(0x400 << 20)
	orr	ip, ip, #(0x07f << 20)
	cmp	r2, ip
	bcs	3f			@ overflow

	@ adjust exponent, merge with sign bit and mantissa.
	movs	xh, xh, lsl #1
	mov	r2, r2, lsl #4
	orr	r0, r3, r2, rrx
	eor	r0, r0, #0x40000000
	mov	pc, lr

2:	@ chech for NAN
	orrs	xl, xl, xh, lsl #12
	movne	r0, #0x7f000000
	orrne	r0, r0, #0x00c00000
	movne	pc, lr			@ return NAN

3:	@ return INF with sign
	and	r0, xh, #0x80000000
	orr	r0, r0, #0x7f000000
	orr	r0, r0, #0x00800000
	mov	pc, lr

4:	@ check if denormalized value is possible
	subs	r2, r2, #((0x380 - 24) << 20)
	andle	r0, xh, #0x80000000	@ too small, return signed 0.
	movle	pc, lr
	
	@ denormalize value so we can resume with the code above afterwards.
	orr	xh, xh, #0x00100000
	mov	r2, r2, lsr #20
	rsb	r2, r2, #25
	cmp	r2, #20
	bgt	6f

	rsb	ip, r2, #32
	mov	r3, xl, lsl ip
	mov	xl, xl, lsr r2
	orr	xl, xl, xh, lsl ip
	movs	xh, xh, lsl #1
	mov	xh, xh, lsr r2
	mov	xh, xh, rrx
5:	teq	r3, #0			@ fold r3 bits into the LSB
	orrne	xl, xl, #1		@ for rounding considerations. 
	mov	r2, #(0x380 << 20)	@ equivalent to the 0 float exponent
	b	1b

6:	rsb	r2, r2, #(12 + 20)
	rsb	ip, r2, #32
	mov	r3, xl, lsl r2
	mov	xl, xl, lsr ip
	orr	xl, xl, xh, lsl r2
	and	xh, xh, #0x80000000
	b	5b


