diff -r 372d3611c693 -r e89a7cf22da3 SConstruct --- a/SConstruct Thu Sep 19 17:55:04 2013 +0200 +++ b/SConstruct Thu Sep 19 18:09:15 2013 +0200 @@ -545,6 +545,9 @@ # Add selected sanity checks from -Wextra main.Append(CXXFLAGS=['-Wmissing-field-initializers', '-Woverloaded-virtual']) + # Some standard library versions require constant macros to be + # enabled explicitly when using C++. + main.Append(CCFLAGS=['-D__STDC_CONSTANT_MACROS']) else: print termcap.Yellow + termcap.Bold + 'Error' + termcap.Normal, print "Don't know what compiler options to use for your compiler." @@ -1109,6 +1112,10 @@ main.SConscript('ext/libfdt/SConscript', variant_dir = joinpath(build_root, 'libfdt')) +# softfloat build is shared across all configs in the build root. +main.SConscript('ext/softfloat/SConscript', + variant_dir = joinpath(build_root, 'softfloat')) + ################################################### # # This function is used to set up a directory with switching headers diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/SConscript --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/SConscript Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,47 @@ +# -*- mode:python -*- + +# Copyright (c) 2013 Andreas Sandberg +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Andreas Sandberg + +Import('main') + +softfloat_files = [ + 'softfloat.cc', + 'softfloatx80.cc', + 'softfloat16.cc', + 'softfloat-specialize.cc', + 'softfloat-round-pack.cc', + 'softfloat-muladd.cc', +] + +main.Library('softfloat', [main.SharedObject(f) for f in softfloat_files]) + +main.Prepend(CPPPATH=Dir('.')) +main.Append(LIBS=['softfloat']) +main.Prepend(LIBPATH=[Dir('.')]) + diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat-compare.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat-compare.hh Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,500 @@ +/*============================================================================ +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#ifndef _SOFTFLOAT_COMPARE_H_ +#define _SOFTFLOAT_COMPARE_H_ + +#include "softfloat.hh" + +BEGIN_SOFTFLOAT_NS + +// ======= float32 ======= // + +typedef int (*float32_compare_method)(float32, float32, float_status_t &status); + +// 0x00 +inline int float32_eq_ordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_equal); +} + +// 0x01 +inline int float32_lt_ordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_less); +} + +// 0x02 +inline int float32_le_ordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_equal); +} + +// 0x03 +inline int float32_unordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_unordered); +} + +// 0x04 +inline int float32_neq_unordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation != float_relation_equal); +} + +// 0x05 +inline int float32_nlt_unordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation != float_relation_less); +} + +// 0x06 +inline int float32_nle_unordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation != float_relation_less) && (relation != float_relation_equal); +} + +// 0x07 +inline int float32_ordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation != float_relation_unordered); +} + +// 0x08 +inline int float32_eq_unordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_equal) || (relation == float_relation_unordered); +} + +// 0x09 +inline int float32_nge_unordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_unordered); +} + +// 0x0a +inline int float32_ngt_unordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation != float_relation_greater); +} + +// 0x0b +inline int float32_false_quiet(float32 a, float32 b, float_status_t &status) +{ + float32_compare_quiet(a, b, status); + return 0; +} + +// 0x0c +inline int float32_neq_ordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation != float_relation_equal) && (relation != float_relation_unordered); +} + +// 0x0d +inline int float32_ge_ordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_greater) || (relation == float_relation_equal); +} + +// 0x0e +inline int float32_gt_ordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_greater); +} + +// 0x0f +inline int float32_true_quiet(float32 a, float32 b, float_status_t &status) +{ + float32_compare_quiet(a, b, status); + return 1; +} + +// 0x10 +inline int float32_eq_ordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_equal); +} + +// 0x11 +inline int float32_lt_ordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_less); +} + +// 0x12 +inline int float32_le_ordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_equal); +} + +// 0x13 +inline int float32_unordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_unordered); +} + +// 0x14 +inline int float32_neq_unordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation != float_relation_equal); +} + +// 0x15 +inline int float32_nlt_unordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation != float_relation_less); +} + +// 0x16 +inline int float32_nle_unordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation != float_relation_less) && (relation != float_relation_equal); +} + +// 0x17 +inline int float32_ordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation != float_relation_unordered); +} + +// 0x18 +inline int float32_eq_unordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation == float_relation_equal) || (relation == float_relation_unordered); +} + +// 0x19 +inline int float32_nge_unordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_unordered); +} + +// 0x1a +inline int float32_ngt_unordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation != float_relation_greater); +} + +// 0x1b +inline int float32_false_signalling(float32 a, float32 b, float_status_t &status) +{ + float32_compare(a, b, status); + return 0; +} + +// 0x1c +inline int float32_neq_ordered_signalling(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare(a, b, status); + return (relation != float_relation_equal) && (relation != float_relation_unordered); +} + +// 0x1d +inline int float32_ge_ordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_greater) || (relation == float_relation_equal); +} + +// 0x1e +inline int float32_gt_ordered_quiet(float32 a, float32 b, float_status_t &status) +{ + int relation = float32_compare_quiet(a, b, status); + return (relation == float_relation_greater); +} + +// 0x1f +inline int float32_true_signalling(float32 a, float32 b, float_status_t &status) +{ + float32_compare(a, b, status); + return 1; +} + +// ======= float64 ======= // + +typedef int (*float64_compare_method)(float64, float64, float_status_t &status); + +// 0x00 +inline int float64_eq_ordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_equal); +} + +// 0x01 +inline int float64_lt_ordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_less); +} + +// 0x02 +inline int float64_le_ordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_equal); +} + +// 0x03 +inline int float64_unordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_unordered); +} + +// 0x04 +inline int float64_neq_unordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation != float_relation_equal); +} + +// 0x05 +inline int float64_nlt_unordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation != float_relation_less); +} + +// 0x06 +inline int float64_nle_unordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation != float_relation_less) && (relation != float_relation_equal); +} + +// 0x07 +inline int float64_ordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation != float_relation_unordered); +} + +// 0x08 +inline int float64_eq_unordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_equal) || (relation == float_relation_unordered); +} + +// 0x09 +inline int float64_nge_unordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_unordered); +} + +// 0x0a +inline int float64_ngt_unordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation != float_relation_greater); +} + +// 0x0b +inline int float64_false_quiet(float64 a, float64 b, float_status_t &status) +{ + float64_compare_quiet(a, b, status); + return 0; +} + +// 0x0c +inline int float64_neq_ordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation != float_relation_equal) && (relation != float_relation_unordered); +} + +// 0x0d +inline int float64_ge_ordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_greater) || (relation == float_relation_equal); +} + +// 0x0e +inline int float64_gt_ordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_greater); +} + +// 0x0f +inline int float64_true_quiet(float64 a, float64 b, float_status_t &status) +{ + float64_compare_quiet(a, b, status); + return 1; +} + +// 0x10 +inline int float64_eq_ordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_equal); +} + +// 0x11 +inline int float64_lt_ordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_less); +} + +// 0x12 +inline int float64_le_ordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_equal); +} + +// 0x13 +inline int float64_unordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_unordered); +} + +// 0x14 +inline int float64_neq_unordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation != float_relation_equal); +} + +// 0x15 +inline int float64_nlt_unordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation != float_relation_less); +} + +// 0x16 +inline int float64_nle_unordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation != float_relation_less) && (relation != float_relation_equal); +} + +// 0x17 +inline int float64_ordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation != float_relation_unordered); +} + +// 0x18 +inline int float64_eq_unordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation == float_relation_equal) || (relation == float_relation_unordered); +} + +// 0x19 +inline int float64_nge_unordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_less) || (relation == float_relation_unordered); +} + +// 0x1a +inline int float64_ngt_unordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation != float_relation_greater); +} + +// 0x1b +inline int float64_false_signalling(float64 a, float64 b, float_status_t &status) +{ + float64_compare(a, b, status); + return 0; +} + +// 0x1c +inline int float64_neq_ordered_signalling(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare(a, b, status); + return (relation != float_relation_equal) && (relation != float_relation_unordered); +} + +// 0x1d +inline int float64_ge_ordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_greater) || (relation == float_relation_equal); +} + +// 0x1e +inline int float64_gt_ordered_quiet(float64 a, float64 b, float_status_t &status) +{ + int relation = float64_compare_quiet(a, b, status); + return (relation == float_relation_greater); +} + +// 0x1f +inline int float64_true_signalling(float64 a, float64 b, float_status_t &status) +{ + float64_compare(a, b, status); + return 1; +} + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat-macros.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat-macros.hh Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,690 @@ +/*============================================================================ +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#ifndef _SOFTFLOAT_MACROS_H_ +#define _SOFTFLOAT_MACROS_H_ + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 16, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +*----------------------------------------------------------------------------*/ + +inline uint16_t shift16RightJamming(uint16_t a, int count) +{ + uint16_t z; + + if (count == 0) { + z = a; + } + else if (count < 16) { + z = (a>>count) | ((a<<((-count) & 15)) != 0); + } + else { + z = (a != 0); + } + + return z; +} + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 32, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +*----------------------------------------------------------------------------*/ + +inline uint32_t shift32RightJamming(uint32_t a, int count) +{ + uint32_t z; + + if (count == 0) { + z = a; + } + else if (count < 32) { + z = (a>>count) | ((a<<((-count) & 31)) != 0); + } + else { + z = (a != 0); + } + + return z; +} + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 64, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +*----------------------------------------------------------------------------*/ + +inline uint64_t shift64RightJamming(uint64_t a, int count) +{ + uint64_t z; + + if (count == 0) { + z = a; + } + else if (count < 64) { + z = (a>>count) | ((a << ((-count) & 63)) != 0); + } + else { + z = (a != 0); + } + + return z; +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 +| _plus_ the number of bits given in `count'. The shifted result is at most +| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The +| bits shifted off form a second 64-bit result as follows: The _last_ bit +| shifted off is the most-significant bit of the extra result, and the other +| 63 bits of the extra result are all zero if and only if _all_but_the_last_ +| bits shifted off were all zero. This extra result is stored in the location +| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. +| (This routine makes more sense if `a0' and `a1' are considered to form +| a fixed-point value with binary point between `a0' and `a1'. This fixed- +| point value is shifted right by the number of bits given in `count', and +| the integer part of the result is returned at the location pointed to by +| `z0Ptr'. The fractional part of the result may be slightly corrupted as +| described above, and is returned at the location pointed to by `z1Ptr'.) +*----------------------------------------------------------------------------*/ + +inline void shift64ExtraRightJamming(uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint64_t z0, z1; + int negCount = (-count) & 63; + + if (count == 0) { + z1 = a1; + z0 = a0; + } + else if (count < 64) { + z1 = (a0<>count; + } + else { + if (count == 64) { + z1 = a0 | (a1 != 0); + } + else { + z1 = ((a0 | a1) != 0); + } + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; +} + +/*---------------------------------------------------------------------------- +| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so +| any carry out is lost. The result is broken into two 64-bit pieces which +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint64_t z1 = a1 + b1; + *z1Ptr = z1; + *z0Ptr = a0 + b0 + (z1 < a1); +} + +/*---------------------------------------------------------------------------- +| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the +| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo +| 2^128, so any borrow out (carry out) is lost. The result is broken into two +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and +| `z1Ptr'. +*----------------------------------------------------------------------------*/ + +inline void + sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + *z1Ptr = a1 - b1; + *z0Ptr = a0 - b0 - (a1 < b1); +} + +/*---------------------------------------------------------------------------- +| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken +| into two 64-bit pieces which are stored at the locations pointed to by +| `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +inline void mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint32_t aHigh, aLow, bHigh, bLow; + uint64_t z0, zMiddleA, zMiddleB, z1; + + aLow = (uint32_t) a; + aHigh = (uint32_t)(a>>32); + bLow = (uint32_t) b; + bHigh = (uint32_t)(b>>32); + z1 = ((uint64_t) aLow) * bLow; + zMiddleA = ((uint64_t) aLow) * bHigh; + zMiddleB = ((uint64_t) aHigh) * bLow; + z0 = ((uint64_t) aHigh) * bHigh; + zMiddleA += zMiddleB; + z0 += (((uint64_t) (zMiddleA < zMiddleB))<<32) + (zMiddleA>>32); + zMiddleA <<= 32; + z1 += zMiddleA; + z0 += (z1 < zMiddleA); + *z1Ptr = z1; + *z0Ptr = z0; +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the 64-bit integer quotient obtained by dividing +| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The +| divisor `b' must be at least 2^63. If q is the exact quotient truncated +| toward zero, the approximation returned lies between q and q + 2 inclusive. +| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit +| unsigned integer is returned. +*----------------------------------------------------------------------------*/ + +#ifdef USE_estimateDiv128To64 +static uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b) +{ + uint64_t b0, b1; + uint64_t rem0, rem1, term0, term1; + uint64_t z; + + if (b <= a0) return UINT64_C(0xFFFFFFFFFFFFFFFF); + b0 = b>>32; + z = (b0<<32 <= a0) ? UINT64_C(0xFFFFFFFF00000000) : (a0 / b0)<<32; + mul64To128(b, z, &term0, &term1); + sub128(a0, a1, term0, term1, &rem0, &rem1); + while (((int64_t) rem0) < 0) { + z -= UINT64_C(0x100000000); + b1 = b<<32; + add128(rem0, rem1, b0, b1, &rem0, &rem1); + } + rem0 = (rem0<<32) | (rem1>>32); + z |= (b0<<32 <= rem0) ? 0xFFFFFFFF : rem0 / b0; + return z; +} +#endif + +/*---------------------------------------------------------------------------- +| Returns an approximation to the square root of the 32-bit significand given +| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of +| `aExp' (the least significant bit) is 1, the integer returned approximates +| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' +| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either +| case, the approximation returned lies strictly within +/-2 of the exact +| value. +*----------------------------------------------------------------------------*/ + +#ifdef USE_estimateSqrt32 +static uint32_t estimateSqrt32(int16_t aExp, uint32_t a) +{ + static const uint16_t sqrtOddAdjustments[] = { + 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, + 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 + }; + static const uint16_t sqrtEvenAdjustments[] = { + 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, + 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 + }; + uint32_t z; + + int index = (a>>27) & 15; + if (aExp & 1) { + z = 0x4000 + (a>>17) - sqrtOddAdjustments[index]; + z = ((a / z)<<14) + (z<<15); + a >>= 1; + } + else { + z = 0x8000 + (a>>17) - sqrtEvenAdjustments[index]; + z = a / z + z; + z = (0x20000 <= z) ? 0xFFFF8000 : (z<<15); + if (z <= a) return (uint32_t) (((int32_t) a)>>1); + } + return ((uint32_t) ((((uint64_t) a)<<31) / z)) + (z>>1); +} +#endif + +static const int countLeadingZeros8[] = { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +#ifdef FLOAT16 + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 16 is returned. +*----------------------------------------------------------------------------*/ + +inline int countLeadingZeros16(uint16_t a) +{ + int shiftCount = 0; + if (a < 0x100) { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZeros8[a>>8]; + return shiftCount; +} + +#endif + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 32 is returned. +*----------------------------------------------------------------------------*/ + +inline int countLeadingZeros32(uint32_t a) +{ + int shiftCount = 0; + if (a < 0x10000) { + shiftCount += 16; + a <<= 16; + } + if (a < 0x1000000) { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZeros8[a>>24]; + return shiftCount; +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 64 is returned. +*----------------------------------------------------------------------------*/ + +inline int countLeadingZeros64(uint64_t a) +{ + int shiftCount = 0; + if (a < UINT64_C(0x100000000)) { + shiftCount += 32; + } + else { + a >>= 32; + } + shiftCount += countLeadingZeros32((uint32_t)(a)); + return shiftCount; +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' can be arbitrarily large; in particular, if `count' is greater +| than 128, the result will be 0. The result is broken into two 64-bit pieces +| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +inline void shift128Right(uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint64_t z0, z1; + int negCount = (-count) & 63; + + if (count == 0) { + z1 = a1; + z0 = a0; + } + else if (count < 64) { + z1 = (a0<>count); + z0 = a0>>count; + } + else { + z1 = (count < 64) ? (a0>>(count & 63)) : 0; + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. If any nonzero bits are shifted off, they +| are ``jammed'' into the least significant bit of the result by setting the +| least significant bit to 1. The value of `count' can be arbitrarily large; +| in particular, if `count' is greater than 128, the result will be either +| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or +| nonzero. The result is broken into two 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +inline void shift128RightJamming(uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr) +{ + uint64_t z0, z1; + int negCount = (-count) & 63; + + if (count == 0) { + z1 = a1; + z0 = a0; + } + else if (count < 64) { + z1 = (a0<>count) | ((a1<>count; + } + else { + if (count == 64) { + z1 = a0 | (a1 != 0); + } + else if (count < 128) { + z1 = (a0>>(count & 63)) | (((a0<>((-count) & 63)); +} + +/*---------------------------------------------------------------------------- +| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the +| 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is +| modulo 2^192, so any carry out is lost. The result is broken into three +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +inline void add192( + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t b0, + uint64_t b1, + uint64_t b2, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr +) +{ + uint64_t z0, z1, z2; + unsigned carry0, carry1; + + z2 = a2 + b2; + carry1 = (z2 < a2); + z1 = a1 + b1; + carry0 = (z1 < a1); + z0 = a0 + b0; + z1 += carry1; + z0 += (z1 < carry1); + z0 += carry0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; +} + +/*---------------------------------------------------------------------------- +| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' +| from the 192-bit value formed by concatenating `a0', `a1', and `a2'. +| Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The +| result is broken into three 64-bit pieces which are stored at the locations +| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +inline void sub192( + uint64_t a0, + uint64_t a1, + uint64_t a2, + uint64_t b0, + uint64_t b1, + uint64_t b2, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr +) +{ + uint64_t z0, z1, z2; + unsigned borrow0, borrow1; + + z2 = a2 - b2; + borrow1 = (a2 < b2); + z1 = a1 - b1; + borrow0 = (a1 < b1); + z0 = a0 - b0; + z0 -= (z1 < borrow1); + z1 -= borrow1; + z0 -= borrow0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' +| is equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +inline int eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) +{ + return (a0 == b0) && (a1 == b1); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than or equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +inline int le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) +{ + return (a0 < b0) || ((a0 == b0) && (a1 <= b1)); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, +| returns 0. +*----------------------------------------------------------------------------*/ + +inline int lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) +{ + return (a0 < b0) || ((a0 == b0) && (a1 < b1)); +} + +#endif /* FLOATX80 */ + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by +| `b' to obtain a 192-bit product. The product is broken into three 64-bit +| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and +| `z2Ptr'. +*----------------------------------------------------------------------------*/ + +inline void mul128By64To192( + uint64_t a0, + uint64_t a1, + uint64_t b, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr +) +{ + uint64_t z0, z1, z2, more1; + + mul64To128(a1, b, &z1, &z2); + mul64To128(a0, b, &z0, &more1); + add128(z0, more1, 0, z1, &z0, &z1); + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; +} + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the +| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit +| product. The product is broken into four 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. +*----------------------------------------------------------------------------*/ + +inline void mul128To256( + uint64_t a0, + uint64_t a1, + uint64_t b0, + uint64_t b1, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr, + uint64_t *z3Ptr +) +{ + uint64_t z0, z1, z2, z3; + uint64_t more1, more2; + + mul64To128(a1, b1, &z2, &z3); + mul64To128(a1, b0, &z1, &more2); + add128(z1, more2, 0, z2, &z1, &z2); + mul64To128(a0, b0, &z0, &more1); + add128(z0, more1, 0, z1, &z0, &z1); + mul64To128(a0, b1, &more1, &more2); + add128(more1, more2, 0, z2, &more1, &z2); + add128(z0, z1, 0, more1, &z0, &z1); + *z3Ptr = z3; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; +} + + +/*---------------------------------------------------------------------------- +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right +| by 64 _plus_ the number of bits given in `count'. The shifted result is +| at most 128 nonzero bits; these are broken into two 64-bit pieces which are +| stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted +| off form a third 64-bit result as follows: The _last_ bit shifted off is +| the most-significant bit of the extra result, and the other 63 bits of the +| extra result are all zero if and only if _all_but_the_last_ bits shifted off +| were all zero. This extra result is stored in the location pointed to by +| `z2Ptr'. The value of `count' can be arbitrarily large. +| (This routine makes more sense if `a0', `a1', and `a2' are considered +| to form a fixed-point value with binary point between `a1' and `a2'. This +| fixed-point value is shifted right by the number of bits given in `count', +| and the integer part of the result is returned at the locations pointed to +| by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly +| corrupted as described above, and is returned at the location pointed to by +| `z2Ptr'.) +*----------------------------------------------------------------------------*/ + +inline void shift128ExtraRightJamming( + uint64_t a0, + uint64_t a1, + uint64_t a2, + int count, + uint64_t *z0Ptr, + uint64_t *z1Ptr, + uint64_t *z2Ptr +) +{ + uint64_t z0, z1, z2; + int negCount = (-count) & 63; + + if (count == 0) { + z2 = a2; + z1 = a1; + z0 = a0; + } + else { + if (count < 64) { + z2 = a1<>count); + z0 = a0>>count; + } + else { + if (count == 64) { + z2 = a1; + z1 = a0; + } + else { + a2 |= a1; + if (count < 128) { + z2 = a0<>(count & 63); + } + else { + z2 = (count == 128) ? a0 : (a0 != 0); + z1 = 0; + } + } + z0 = 0; + } + z2 |= (a2 != 0); + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; +} + +#endif /* FLOAT128 */ + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat-muladd.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat-muladd.cc Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,562 @@ +/*============================================================================ +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * This code is based on QEMU patch by Peter Maydell + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#include "softfloat.hh" +#include "softfloat-round-pack.hh" + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target +| if desired). +*----------------------------------------------------------------------------*/ +#include "softfloat-macros.hh" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize.hh" + +/*---------------------------------------------------------------------------- +| Takes three single-precision floating-point values `a', `b' and `c', one of +| which is a NaN, and returns the appropriate NaN result. If any of `a', +| `b' or `c' is a signaling NaN, the invalid exception is raised. +| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case +| obviously c is a NaN, and whether to propagate c or some other NaN is +| implementation defined). +*----------------------------------------------------------------------------*/ + +BEGIN_SOFTFLOAT_NS + +static float32 propagateFloat32MulAddNaN(float32 a, float32 b, float32 c, float_status_t &status) +{ + int aIsNaN = float32_is_nan(a); + int bIsNaN = float32_is_nan(b); + + int aIsSignalingNaN = float32_is_signaling_nan(a); + int bIsSignalingNaN = float32_is_signaling_nan(b); + int cIsSignalingNaN = float32_is_signaling_nan(c); + + a |= 0x00400000; + b |= 0x00400000; + c |= 0x00400000; + + if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) + float_raise(status, float_flag_invalid); + + // operate according to float_first_operand_nan mode + if (aIsSignalingNaN | aIsNaN) { + return a; + } + else { + return (bIsSignalingNaN | bIsNaN) ? b : c; + } +} + +/*---------------------------------------------------------------------------- +| Takes three double-precision floating-point values `a', `b' and `c', one of +| which is a NaN, and returns the appropriate NaN result. If any of `a', +| `b' or `c' is a signaling NaN, the invalid exception is raised. +| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case +| obviously c is a NaN, and whether to propagate c or some other NaN is +| implementation defined). +*----------------------------------------------------------------------------*/ + +static float64 propagateFloat64MulAddNaN(float64 a, float64 b, float64 c, float_status_t &status) +{ + int aIsNaN = float64_is_nan(a); + int bIsNaN = float64_is_nan(b); + + int aIsSignalingNaN = float64_is_signaling_nan(a); + int bIsSignalingNaN = float64_is_signaling_nan(b); + int cIsSignalingNaN = float64_is_signaling_nan(c); + + a |= UINT64_C(0x0008000000000000); + b |= UINT64_C(0x0008000000000000); + c |= UINT64_C(0x0008000000000000); + + if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) + float_raise(status, float_flag_invalid); + + // operate according to float_first_operand_nan mode + if (aIsSignalingNaN | aIsNaN) { + return a; + } + else { + return (bIsSignalingNaN | bIsNaN) ? b : c; + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b' then adding 'c', with no intermediate rounding step after the +| multiplication. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic 754-2008. +| The flags argument allows the caller to select negation of the +| addend, the intermediate product, or the final result. (The difference +| between this and having the caller do a separate negation is that negating +| externally will flip the sign bit on NaNs.) +*----------------------------------------------------------------------------*/ + +float32 float32_muladd(float32 a, float32 b, float32 c, int flags, float_status_t &status) +{ + int aSign, bSign, cSign, zSign; + int16_t aExp, bExp, cExp, pExp, zExp; + uint32_t aSig, bSig, cSig; + int pInf, pZero, pSign; + uint64_t pSig64, cSig64, zSig64; + uint32_t pSig; + int shiftcount; + + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + aSign = extractFloat32Sign(a); + bSig = extractFloat32Frac(b); + bExp = extractFloat32Exp(b); + bSign = extractFloat32Sign(b); + cSig = extractFloat32Frac(c); + cExp = extractFloat32Exp(c); + cSign = extractFloat32Sign(c); + + /* It is implementation-defined whether the cases of (0,inf,qnan) + * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN + * they return if they do), so we have to hand this information + * off to the target-specific pick-a-NaN routine. + */ + if (((aExp == 0xff) && aSig) || + ((bExp == 0xff) && bSig) || + ((cExp == 0xff) && cSig)) { + return propagateFloat32MulAddNaN(a, b, c, status); + } + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + if (cExp == 0) cSig = 0; + } + + int infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || + (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); + + if (infzero) { + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + + if (flags & float_muladd_negate_c) { + cSign ^= 1; + } + + /* Work out the sign and type of the product */ + pSign = aSign ^ bSign; + if (flags & float_muladd_negate_product) { + pSign ^= 1; + } + pInf = (aExp == 0xff) || (bExp == 0xff); + pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); + + if (cExp == 0xff) { + if (pInf && (pSign ^ cSign)) { + /* addition of opposite-signed infinities => InvalidOperation */ + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + /* Otherwise generate an infinity of the same sign */ + if ((aSig && aExp == 0) || (bSig && bExp == 0)) { + float_raise(status, float_flag_denormal); + } + return packFloat32(cSign, 0xff, 0); + } + + if (pInf) { + if ((aSig && aExp == 0) || (bSig && bExp == 0) || (cSig && cExp == 0)) { + float_raise(status, float_flag_denormal); + } + return packFloat32(pSign, 0xff, 0); + } + + if (pZero) { + if (cExp == 0) { + if (cSig == 0) { + /* Adding two exact zeroes */ + if (pSign == cSign) { + zSign = pSign; + } else if (get_float_rounding_mode(status) == float_round_down) { + zSign = 1; + } else { + zSign = 0; + } + return packFloat32(zSign, 0, 0); + } + /* Exact zero plus a denormal */ + float_raise(status, float_flag_denormal); + if (get_flush_underflow_to_zero(status)) { + float_raise(status, float_flag_underflow | float_flag_inexact); + return packFloat32(cSign, 0, 0); + } + } + /* Zero plus something non-zero */ + return packFloat32(cSign, cExp, cSig); + } + + if (aExp == 0) { + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(bSig, &bExp, &bSig); + } + + /* Calculate the actual result a * b + c */ + + /* Multiply first; this is easy. */ + /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f + * because we want the true exponent, not the "one-less-than" + * flavour that roundAndPackFloat32() takes. + */ + pExp = aExp + bExp - 0x7e; + aSig = (aSig | 0x00800000) << 7; + bSig = (bSig | 0x00800000) << 8; + pSig64 = (uint64_t)aSig * bSig; + if ((int64_t)(pSig64 << 1) >= 0) { + pSig64 <<= 1; + pExp--; + } + + zSign = pSign; + + /* Now pSig64 is the significand of the multiply, with the explicit bit in + * position 62. + */ + if (cExp == 0) { + if (!cSig) { + /* Throw out the special case of c being an exact zero now */ + pSig = (uint32_t) shift64RightJamming(pSig64, 32); + return roundAndPackFloat32(zSign, pExp - 1, pSig, status); + } + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(cSig, &cExp, &cSig); + } + + cSig64 = (uint64_t)cSig << 39; + cSig64 |= UINT64_C(0x4000000000000000); + int expDiff = pExp - cExp; + + if (pSign == cSign) { + /* Addition */ + if (expDiff > 0) { + /* scale c to match p */ + cSig64 = shift64RightJamming(cSig64, expDiff); + zExp = pExp; + } else if (expDiff < 0) { + /* scale p to match c */ + pSig64 = shift64RightJamming(pSig64, -expDiff); + zExp = cExp; + } else { + /* no scaling needed */ + zExp = cExp; + } + /* Add significands and make sure explicit bit ends up in posn 62 */ + zSig64 = pSig64 + cSig64; + if ((int64_t)zSig64 < 0) { + zSig64 = shift64RightJamming(zSig64, 1); + } else { + zExp--; + } + zSig64 = shift64RightJamming(zSig64, 32); + return roundAndPackFloat32(zSign, zExp, zSig64, status); + } else { + /* Subtraction */ + if (expDiff > 0) { + cSig64 = shift64RightJamming(cSig64, expDiff); + zSig64 = pSig64 - cSig64; + zExp = pExp; + } else if (expDiff < 0) { + pSig64 = shift64RightJamming(pSig64, -expDiff); + zSig64 = cSig64 - pSig64; + zExp = cExp; + zSign ^= 1; + } else { + zExp = pExp; + if (cSig64 < pSig64) { + zSig64 = pSig64 - cSig64; + } else if (pSig64 < cSig64) { + zSig64 = cSig64 - pSig64; + zSign ^= 1; + } else { + /* Exact zero */ + return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0); + } + } + --zExp; + /* Do the equivalent of normalizeRoundAndPackFloat32() but + * starting with the significand in a uint64_t. + */ + shiftcount = countLeadingZeros64(zSig64) - 1; + zSig64 <<= shiftcount; + zExp -= shiftcount; + zSig64 = shift64RightJamming(zSig64, 32); + return roundAndPackFloat32(zSign, zExp, zSig64, status); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b' then adding 'c', with no intermediate rounding step after the +| multiplication. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic 754-2008. +| The flags argument allows the caller to select negation of the +| addend, the intermediate product, or the final result. (The difference +| between this and having the caller do a separate negation is that negating +| externally will flip the sign bit on NaNs.) +*----------------------------------------------------------------------------*/ + +float64 float64_muladd(float64 a, float64 b, float64 c, int flags, float_status_t &status) +{ + int aSign, bSign, cSign, zSign; + int16_t aExp, bExp, cExp, pExp, zExp; + uint64_t aSig, bSig, cSig; + int pInf, pZero, pSign; + uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; + int shiftcount; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + bSig = extractFloat64Frac(b); + bExp = extractFloat64Exp(b); + bSign = extractFloat64Sign(b); + cSig = extractFloat64Frac(c); + cExp = extractFloat64Exp(c); + cSign = extractFloat64Sign(c); + + /* It is implementation-defined whether the cases of (0,inf,qnan) + * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN + * they return if they do), so we have to hand this information + * off to the target-specific pick-a-NaN routine. + */ + if (((aExp == 0x7ff) && aSig) || + ((bExp == 0x7ff) && bSig) || + ((cExp == 0x7ff) && cSig)) { + return propagateFloat64MulAddNaN(a, b, c, status); + } + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + if (cExp == 0) cSig = 0; + } + + int infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || + (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); + + if (infzero) { + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + + if (flags & float_muladd_negate_c) { + cSign ^= 1; + } + + /* Work out the sign and type of the product */ + pSign = aSign ^ bSign; + if (flags & float_muladd_negate_product) { + pSign ^= 1; + } + pInf = (aExp == 0x7ff) || (bExp == 0x7ff); + pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); + + if (cExp == 0x7ff) { + if (pInf && (pSign ^ cSign)) { + /* addition of opposite-signed infinities => InvalidOperation */ + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + /* Otherwise generate an infinity of the same sign */ + if ((aSig && aExp == 0) || (bSig && bExp == 0)) { + float_raise(status, float_flag_denormal); + } + return packFloat64(cSign, 0x7ff, 0); + } + + if (pInf) { + if ((aSig && aExp == 0) || (bSig && bExp == 0) || (cSig && cExp == 0)) { + float_raise(status, float_flag_denormal); + } + return packFloat64(pSign, 0x7ff, 0); + } + + if (pZero) { + if (cExp == 0) { + if (cSig == 0) { + /* Adding two exact zeroes */ + if (pSign == cSign) { + zSign = pSign; + } else if (get_float_rounding_mode(status) == float_round_down) { + zSign = 1; + } else { + zSign = 0; + } + return packFloat64(zSign, 0, 0); + } + /* Exact zero plus a denormal */ + float_raise(status, float_flag_denormal); + if (get_flush_underflow_to_zero(status)) { + float_raise(status, float_flag_underflow | float_flag_inexact); + return packFloat64(cSign, 0, 0); + } + } + /* Zero plus something non-zero */ + return packFloat64(cSign, cExp, cSig); + } + + if (aExp == 0) { + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(bSig, &bExp, &bSig); + } + + /* Calculate the actual result a * b + c */ + + /* Multiply first; this is easy. */ + /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff + * because we want the true exponent, not the "one-less-than" + * flavour that roundAndPackFloat64() takes. + */ + pExp = aExp + bExp - 0x3fe; + aSig = (aSig | UINT64_C(0x0010000000000000))<<10; + bSig = (bSig | UINT64_C(0x0010000000000000))<<11; + mul64To128(aSig, bSig, &pSig0, &pSig1); + if ((int64_t)(pSig0 << 1) >= 0) { + shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); + pExp--; + } + + zSign = pSign; + + /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit + * bit in position 126. + */ + if (cExp == 0) { + if (!cSig) { + /* Throw out the special case of c being an exact zero now */ + shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); + return roundAndPackFloat64(zSign, pExp - 1, pSig1, status); + } + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(cSig, &cExp, &cSig); + } + + cSig0 = cSig << 10; + cSig1 = 0; + cSig0 |= UINT64_C(0x4000000000000000); + int expDiff = pExp - cExp; + + if (pSign == cSign) { + /* Addition */ + if (expDiff > 0) { + /* scale c to match p */ + shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); + zExp = pExp; + } else if (expDiff < 0) { + /* scale p to match c */ + shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); + zExp = cExp; + } else { + /* no scaling needed */ + zExp = cExp; + } + /* Add significands and make sure explicit bit ends up in posn 126 */ + add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + if ((int64_t)zSig0 < 0) { + shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); + } else { + zExp--; + } + shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); + return roundAndPackFloat64(zSign, zExp, zSig1, status); + } else { + /* Subtraction */ + if (expDiff > 0) { + shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); + sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + zExp = pExp; + } else if (expDiff < 0) { + shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); + sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); + zExp = cExp; + zSign ^= 1; + } else { + zExp = pExp; + if (lt128(cSig0, cSig1, pSig0, pSig1)) { + sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { + sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); + zSign ^= 1; + } else { + /* Exact zero */ + return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0); + } + } + --zExp; + /* Do the equivalent of normalizeRoundAndPackFloat64() but + * starting with the significand in a pair of uint64_t. + */ + if (zSig0) { + shiftcount = countLeadingZeros64(zSig0) - 1; + shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); + if (zSig1) { + zSig0 |= 1; + } + zExp -= shiftcount; + } else { + shiftcount = countLeadingZeros64(zSig1) - 1; + zSig0 = zSig1 << shiftcount; + zExp -= (shiftcount + 64); + } + return roundAndPackFloat64(zSign, zExp, zSig0, status); + } +} + +END_SOFTFLOAT_NS diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat-round-pack.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat-round-pack.hh Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,301 @@ +/*============================================================================ +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#ifndef _SOFTFLOAT_ROUND_PACK_H_ +#define _SOFTFLOAT_ROUND_PACK_H_ + +#include "softfloat.hh" + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 +| and 7, and returns the properly rounded 32-bit integer corresponding to the +| input. If `zSign' is 1, the input is negated before being converted to an +| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input +| is simply rounded to an integer, with the inexact exception raised if the +| input cannot be represented exactly as an integer. However, if the fixed- +| point input is too large, the invalid exception is raised and the integer +| indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int32_t roundAndPackInt32(int zSign, uint64_t absZ, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and +| `absZ1', with binary point between bits 63 and 64 (between the input words), +| and returns the properly rounded 64-bit integer corresponding to the input. +| If `zSign' is 1, the input is negated before being converted to an integer. +| Ordinarily, the fixed-point input is simply rounded to an integer, with +| the inexact exception raised if the input cannot be represented exactly as +| an integer. However, if the fixed-point input is too large, the invalid +| exception is raised and the integer indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int64_t roundAndPackInt64(int zSign, uint64_t absZ0, uint64_t absZ1, float_status_t &status); + +#ifdef FLOAT16 + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal half-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloat16Subnormal(uint16_t aSig, int16_t *zExpPtr, uint16_t *zSigPtr); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper half-precision floating- +| point value corresponding to the abstract input. Ordinarily, the abstract +| value is simply rounded and packed into the half-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal single- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 14 +| and 13, which is 4 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float16 roundAndPackFloat16(int zSign, int16_t zExp, uint16_t zSig, float_status_t &status); + +#endif + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal single-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloat32Subnormal(uint32_t aSig, int16_t *zExpPtr, uint32_t *zSigPtr); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. Ordinarily, the abstract +| value is simply rounded and packed into the single-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal single- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 30 +| and 29, which is 7 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 roundAndPackFloat32(int zSign, int16_t zExp, uint32_t zSig, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat32' except that `zSig' does not have to be normalized. +| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +float32 normalizeRoundAndPackFloat32(int zSign, int16_t zExp, uint32_t zSig, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal double-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloat64Subnormal(uint64_t aSig, int16_t *zExpPtr, uint64_t *zSigPtr); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. Ordinarily, the abstract +| value is simply rounded and packed into the double-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded +| to a subnormal number, and the underflow and inexact exceptions are raised +| if the abstract input cannot be represented exactly as a subnormal double- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 62 +| and 61, which is 10 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 roundAndPackFloat64(int zSign, int16_t zExp, uint64_t zSig, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat64' except that `zSig' does not have to be normalized. +| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +float64 normalizeRoundAndPackFloat64(int zSign, int16_t zExp, uint64_t zSig, float_status_t &status); + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal extended double-precision floating-point value +| represented by the denormalized significand `aSig'. The normalized exponent +| and significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| rounded and packed into the extended double-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal extended +| double-precision floating-point number. +| If `roundingPrecision' is 32 or 64, the result is rounded to the same +| number of bits as single or double precision, respectively. Otherwise, the +| result is rounded to the full precision of the extended double-precision +| format. +| The input significand must be normalized or smaller. If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. The +| handling of underflow and overflow follows the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 roundAndPackFloatx80(int roundingPrecision, + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent +| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. This routine is just like +| `roundAndPackFloatx80' except that the input significand does not have to be +| normalized. +*----------------------------------------------------------------------------*/ + +floatx80 normalizeRoundAndPackFloatx80(int roundingPrecision, + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, float_status_t &status); + +#endif // FLOATX80 + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal quadruple-precision floating-point value +| represented by the denormalized significand formed by the concatenation of +| `aSig0' and `aSig1'. The normalized exponent is stored at the location +| pointed to by `zExpPtr'. The most significant 49 bits of the normalized +| significand are stored at the location pointed to by `zSig0Ptr', and the +| least significant 64 bits of the normalized significand are stored at the +| location pointed to by `zSig1Ptr'. +*----------------------------------------------------------------------------*/ + +void normalizeFloat128Subnormal( + uint64_t aSig0, uint64_t aSig1, int32_t *zExpPtr, uint64_t *zSig0Ptr, uint64_t *zSig1Ptr); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0', `zSig1', +| and `zSig2', and returns the proper quadruple-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| simply rounded and packed into the quadruple-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal quadruple- +| precision floating-point number. +| The input significand must be normalized or smaller. If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. In the +| usual case that the input significand is normalized, `zExp' must be 1 less +| than the ``true'' floating-point exponent. The handling of underflow and +| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 roundAndPackFloat128( + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand formed by the concatenation of `zSig0' and `zSig1', and +| returns the proper quadruple-precision floating-point value corresponding +| to the abstract input. This routine is just like `roundAndPackFloat128' +| except that the input significand has fewer bits and does not have to be +| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- +| point exponent. +*----------------------------------------------------------------------------*/ + +float128 normalizeRoundAndPackFloat128( + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, float_status_t &status); + +#endif // FLOAT128 + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat-round-pack.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat-round-pack.cc Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,854 @@ +/*============================================================================ +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +#define FLOAT128 + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#include "softfloat.hh" +#include "softfloat-round-pack.hh" + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target +| if desired). +*----------------------------------------------------------------------------*/ +#include "softfloat-macros.hh" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize.hh" + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 +| and 7, and returns the properly rounded 32-bit integer corresponding to the +| input. If `zSign' is 1, the input is negated before being converted to an +| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input +| is simply rounded to an integer, with the inexact exception raised if the +| input cannot be represented exactly as an integer. However, if the fixed- +| point input is too large, the invalid exception is raised and the integer +| indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int32_t roundAndPackInt32(int zSign, uint64_t exactAbsZ, float_status_t &status) +{ + int roundingMode = get_float_rounding_mode(status); + int roundNearestEven = (roundingMode == float_round_nearest_even); + int roundIncrement = 0x40; + if (! roundNearestEven) { + if (roundingMode == float_round_to_zero) roundIncrement = 0; + else { + roundIncrement = 0x7F; + if (zSign) { + if (roundingMode == float_round_up) roundIncrement = 0; + } + else { + if (roundingMode == float_round_down) roundIncrement = 0; + } + } + } + int roundBits = (int)(exactAbsZ & 0x7F); + uint64_t absZ = (exactAbsZ + roundIncrement)>>7; + absZ &= ~(((roundBits ^ 0x40) == 0) & roundNearestEven); + int32_t z = (int32_t) absZ; + if (zSign) z = -z; + if ((absZ>>32) || (z && ((z < 0) ^ zSign))) { + float_raise(status, float_flag_invalid); + return (int32_t)(int32_indefinite); + } + if (roundBits) { + float_raise(status, float_flag_inexact); + if ((absZ << 7) > exactAbsZ) + set_float_rounding_up(status); + } + return z; +} + +/*---------------------------------------------------------------------------- +| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and +| `absZ1', with binary point between bits 63 and 64 (between the input words), +| and returns the properly rounded 64-bit integer corresponding to the input. +| If `zSign' is 1, the input is negated before being converted to an integer. +| Ordinarily, the fixed-point input is simply rounded to an integer, with +| the inexact exception raised if the input cannot be represented exactly as +| an integer. However, if the fixed-point input is too large, the invalid +| exception is raised and the integer indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int64_t roundAndPackInt64(int zSign, uint64_t absZ0, uint64_t absZ1, float_status_t &status) +{ + int64_t z; + int roundingMode = get_float_rounding_mode(status); + int roundNearestEven = (roundingMode == float_round_nearest_even); + int increment = ((int64_t) absZ1 < 0); + if (! roundNearestEven) { + if (roundingMode == float_round_to_zero) increment = 0; + else { + if (zSign) { + increment = (roundingMode == float_round_down) && absZ1; + } + else { + increment = (roundingMode == float_round_up) && absZ1; + } + } + } + uint64_t exactAbsZ0 = absZ0; + if (increment) { + ++absZ0; + if (absZ0 == 0) goto overflow; + absZ0 &= ~(((uint64_t) (absZ1<<1) == 0) & roundNearestEven); + } + z = absZ0; + if (zSign) z = -z; + if (z && ((z < 0) ^ zSign)) { + overflow: + float_raise(status, float_flag_invalid); + return (int64_t)(int64_indefinite); + } + if (absZ1) { + float_raise(status, float_flag_inexact); + if (absZ0 > exactAbsZ0) + set_float_rounding_up(status); + } + return z; +} + +#ifdef FLOAT16 + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal half-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloat16Subnormal(uint16_t aSig, int16_t *zExpPtr, uint16_t *zSigPtr) +{ + int shiftCount = countLeadingZeros16(aSig) - 5; + *zSigPtr = aSig<> 4; + zSigRound &= ~(((roundBits ^ 0x10) == 0) & roundNearestEven); + if (zSigRound == 0) zExp = 0; + return packFloat16(zSign, zExp, zSigRound); +} + +#endif + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal single-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloat32Subnormal(uint32_t aSig, int16_t *zExpPtr, uint32_t *zSigPtr) +{ + int shiftCount = countLeadingZeros32(aSig) - 8; + *zSigPtr = aSig<> 7; + zSigRound &= ~(((roundBits ^ 0x40) == 0) & roundNearestEven); + if (zSigRound == 0) zExp = 0; + if (roundBits) { + float_raise(status, float_flag_inexact); + if ((zSigRound << 7) > zSig) set_float_rounding_up(status); + } + return packFloat32(zSign, zExp, zSigRound); +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat32' except that `zSig' does not have to be normalized. +| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +float32 normalizeRoundAndPackFloat32(int zSign, int16_t zExp, uint32_t zSig, float_status_t &status) +{ + int shiftCount = countLeadingZeros32(zSig) - 1; + return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<>10; + zSigRound &= ~(((roundBits ^ 0x200) == 0) & roundNearestEven); + if (zSigRound == 0) zExp = 0; + if (roundBits) { + float_raise(status, float_flag_inexact); + if ((zSigRound << 10) > zSig) set_float_rounding_up(status); + } + return packFloat64(zSign, zExp, zSigRound); +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat64' except that `zSig' does not have to be normalized. +| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +float64 normalizeRoundAndPackFloat64(int zSign, int16_t zExp, uint64_t zSig, float_status_t &status) +{ + int shiftCount = countLeadingZeros64(zSig) - 1; + return roundAndPackFloat64(zSign, zExp - shiftCount, zSig< zSigExact) set_float_rounding_up(status); + } + return packFloatx80(zSign, zExp, zSig0); + } + } + if (roundBits) float_raise(status, float_flag_inexact); + zSigExact = zSig0; + zSig0 += roundIncrement; + if (zSig0 < roundIncrement) { + // Basically scale by shifting right and keep overflow + ++zExp; + zSig0 = UINT64_C(0x8000000000000000); + zSigExact >>= 1; // must scale also, or else later tests will fail + } + roundIncrement = roundMask + 1; + if (roundNearestEven && (roundBits<<1 == roundIncrement)) + roundMask |= roundIncrement; + zSig0 &= ~roundMask; + if (zSig0 > zSigExact) set_float_rounding_up(status); + if (zSig0 == 0) zExp = 0; + return packFloatx80(zSign, zExp, zSig0); + precision80: + increment = ((int64_t) zSig1 < 0); + if (! roundNearestEven) { + if (roundingMode == float_round_to_zero) increment = 0; + else { + if (zSign) { + increment = (roundingMode == float_round_down) && zSig1; + } + else { + increment = (roundingMode == float_round_up) && zSig1; + } + } + } + if (0x7FFD <= (uint32_t) (zExp - 1)) { + if ((0x7FFE < zExp) + || ((zExp == 0x7FFE) + && (zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF)) + && increment)) + { + roundMask = 0; + overflow: + float_raise(status, float_flag_overflow | float_flag_inexact); + if ((roundingMode == float_round_to_zero) + || (zSign && (roundingMode == float_round_up)) + || (! zSign && (roundingMode == float_round_down))) + { + return packFloatx80(zSign, 0x7FFE, ~roundMask); + } + set_float_rounding_up(status); + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (zExp <= 0) { + int isTiny = (zExp < 0) || (! increment) + || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); + shift64ExtraRightJamming(zSig0, zSig1, 1 - zExp, &zSig0, &zSig1); + zExp = 0; + if (isTiny) { + if (zSig1 || (zSig0 && !float_exception_masked(status, float_flag_underflow))) + float_raise(status, float_flag_underflow); + } + if (zSig1) float_raise(status, float_flag_inexact); + if (roundNearestEven) increment = ((int64_t) zSig1 < 0); + else { + if (zSign) { + increment = (roundingMode == float_round_down) && zSig1; + } else { + increment = (roundingMode == float_round_up) && zSig1; + } + } + if (increment) { + zSigExact = zSig0++; + zSig0 &= ~(((uint64_t) (zSig1<<1) == 0) & roundNearestEven); + if (zSig0 > zSigExact) set_float_rounding_up(status); + if ((int64_t) zSig0 < 0) zExp = 1; + } + return packFloatx80(zSign, zExp, zSig0); + } + } + if (zSig1) float_raise(status, float_flag_inexact); + if (increment) { + zSigExact = zSig0++; + if (zSig0 == 0) { + zExp++; + zSig0 = UINT64_C(0x8000000000000000); + zSigExact >>= 1; // must scale also, or else later tests will fail + } + else { + zSig0 &= ~(((uint64_t) (zSig1<<1) == 0) & roundNearestEven); + } + if (zSig0 > zSigExact) set_float_rounding_up(status); + } + else { + if (zSig0 == 0) zExp = 0; + } + return packFloatx80(zSign, zExp, zSig0); +} + +floatx80 roundAndPackFloatx80(int roundingPrecision, + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, float_status_t &status) +{ + float_status_t round_status = status; + floatx80 result = SoftFloatRoundAndPackFloatx80(roundingPrecision, zSign, zExp, zSig0, zSig1, status); + + // bias unmasked undeflow + if (status.float_exception_flags & ~status.float_exception_masks & float_flag_underflow) { + float_raise(round_status, float_flag_underflow); + return SoftFloatRoundAndPackFloatx80(roundingPrecision, zSign, zExp + 0x6000, zSig0, zSig1, status = round_status); + } + + // bias unmasked overflow + if (status.float_exception_flags & ~status.float_exception_masks & float_flag_overflow) { + float_raise(round_status, float_flag_overflow); + return SoftFloatRoundAndPackFloatx80(roundingPrecision, zSign, zExp - 0x6000, zSig0, zSig1, status = round_status); + } + + return result; +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent +| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. This routine is just like +| `roundAndPackFloatx80' except that the input significand does not have to be +| normalized. +*----------------------------------------------------------------------------*/ + +floatx80 normalizeRoundAndPackFloatx80(int roundingPrecision, + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, float_status_t &status) +{ + if (zSig0 == 0) { + zSig0 = zSig1; + zSig1 = 0; + zExp -= 64; + } + int shiftCount = countLeadingZeros64(zSig0); + shortShift128Left(zSig0, zSig1, shiftCount, &zSig0, &zSig1); + zExp -= shiftCount; + return + roundAndPackFloatx80(roundingPrecision, zSign, zExp, zSig0, zSig1, status); +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal quadruple-precision floating-point value +| represented by the denormalized significand formed by the concatenation of +| `aSig0' and `aSig1'. The normalized exponent is stored at the location +| pointed to by `zExpPtr'. The most significant 49 bits of the normalized +| significand are stored at the location pointed to by `zSig0Ptr', and the +| least significant 64 bits of the normalized significand are stored at the +| location pointed to by `zSig1Ptr'. +*----------------------------------------------------------------------------*/ + +void normalizeFloat128Subnormal( + uint64_t aSig0, uint64_t aSig1, int32_t *zExpPtr, uint64_t *zSig0Ptr, uint64_t *zSig1Ptr) +{ + int shiftCount; + + if (aSig0 == 0) { + shiftCount = countLeadingZeros64(aSig1) - 15; + if (shiftCount < 0) { + *zSig0Ptr = aSig1 >>(-shiftCount); + *zSig1Ptr = aSig1 << (shiftCount & 63); + } + else { + *zSig0Ptr = aSig1 << shiftCount; + *zSig1Ptr = 0; + } + *zExpPtr = - shiftCount - 63; + } + else { + shiftCount = countLeadingZeros64(aSig0) - 15; + shortShift128Left(aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr); + *zExpPtr = 1 - shiftCount; + } +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0', `zSig1', +| and `zSig2', and returns the proper quadruple-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| simply rounded and packed into the quadruple-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal quadruple- +| precision floating-point number. +| The input significand must be normalized or smaller. If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. In the +| usual case that the input significand is normalized, `zExp' must be 1 less +| than the ``true'' floating-point exponent. The handling of underflow and +| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 roundAndPackFloat128( + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2, float_status_t &status) +{ + int increment = ((int64_t) zSig2 < 0); + if (0x7FFD <= (uint32_t) zExp) { + if ((0x7FFD < zExp) + || ((zExp == 0x7FFD) + && eq128(UINT64_C(0x0001FFFFFFFFFFFF), + UINT64_C(0xFFFFFFFFFFFFFFFF), zSig0, zSig1) + && increment)) + { + float_raise(status, float_flag_overflow | float_flag_inexact); + return packFloat128(zSign, 0x7FFF, 0, 0); + } + if (zExp < 0) { + int isTiny = (zExp < -1) + || ! increment + || lt128(zSig0, zSig1, + UINT64_C(0x0001FFFFFFFFFFFF), + UINT64_C(0xFFFFFFFFFFFFFFFF)); + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, -zExp, &zSig0, &zSig1, &zSig2); + zExp = 0; + if (isTiny && zSig2) float_raise(status, float_flag_underflow); + increment = ((int64_t) zSig2 < 0); + } + } + if (zSig2) float_raise(status, float_flag_inexact); + if (increment) { + add128(zSig0, zSig1, 0, 1, &zSig0, &zSig1); + zSig1 &= ~((zSig2 + zSig2 == 0) & 1); + } + else { + if ((zSig0 | zSig1) == 0) zExp = 0; + } + return packFloat128(zSign, zExp, zSig0, zSig1); +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand formed by the concatenation of `zSig0' and `zSig1', and +| returns the proper quadruple-precision floating-point value corresponding +| to the abstract input. This routine is just like `roundAndPackFloat128' +| except that the input significand has fewer bits and does not have to be +| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- +| point exponent. +*----------------------------------------------------------------------------*/ + +float128 normalizeRoundAndPackFloat128( + int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1, float_status_t &status) +{ + uint64_t zSig2; + + if (zSig0 == 0) { + zSig0 = zSig1; + zSig1 = 0; + zExp -= 64; + } + int shiftCount = countLeadingZeros64(zSig0) - 15; + if (0 <= shiftCount) { + zSig2 = 0; + shortShift128Left(zSig0, zSig1, shiftCount, &zSig0, &zSig1); + } + else { + shift128ExtraRightJamming( + zSig0, zSig1, 0, -shiftCount, &zSig0, &zSig1, &zSig2); + } + zExp -= shiftCount; + return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); +} + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat-specialize.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat-specialize.hh Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,764 @@ +/*============================================================================ +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +#ifndef _SOFTFLOAT_SPECIALIZE_H_ +#define _SOFTFLOAT_SPECIALIZE_H_ + +#include "softfloat.hh" +#include "softfloat-macros.hh" + +BEGIN_SOFTFLOAT_NS + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#define int16_indefinite ((int16_t)0x8000) +#define int32_indefinite ((int32_t)0x80000000) +#define int64_indefinite UINT64_C(0x8000000000000000) + +/*---------------------------------------------------------------------------- +| Internal canonical NaN format. +*----------------------------------------------------------------------------*/ + +typedef struct { + int sign; + uint64_t hi, lo; +} commonNaNT; + +#ifdef FLOAT16 + +/*---------------------------------------------------------------------------- +| The pattern for a default generated half-precision NaN. +*----------------------------------------------------------------------------*/ +#define float16_default_nan 0xFE00 + +#define float16_fraction extractFloat16Frac +#define float16_exp extractFloat16Exp +#define float16_sign extractFloat16Sign + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the half-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline uint16_t extractFloat16Frac(float16 a) +{ + return a & 0x3FF; +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the half-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline int16_t extractFloat16Exp(float16 a) +{ + return (a>>10) & 0x1F; +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the half-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline int extractFloat16Sign(float16 a) +{ + return a>>15; +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| single-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +inline float16 packFloat16(int zSign, int zExp, uint16_t zSig) +{ + return (((uint16_t) zSign)<<15) + (((uint16_t) zExp)<<10) + zSig; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the half-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float16_is_nan(float16 a) +{ + return (0xF800 < (uint16_t) (a<<1)); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the half-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float16_is_signaling_nan(float16 a) +{ + return (((a>>9) & 0x3F) == 0x3E) && (a & 0x1FF); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the half-precision floating-point value `a' is denormal; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float16_is_denormal(float16 a) +{ + return (extractFloat16Exp(a) == 0) && (extractFloat16Frac(a) != 0); +} + +/*---------------------------------------------------------------------------- +| Convert float16 denormals to zero. +*----------------------------------------------------------------------------*/ + +inline float16 float16_denormal_to_zero(float16 a) +{ + if (float16_is_denormal(a)) a &= 0x8000; + return a; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the half-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +inline commonNaNT float16ToCommonNaN(float16 a, float_status_t &status) +{ + commonNaNT z; + if (float16_is_signaling_nan(a)) float_raise(status, float_flag_invalid); + z.sign = a>>15; + z.lo = 0; + z.hi = ((uint64_t) a)<<54; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the half- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +inline float16 commonNaNToFloat16(commonNaNT a) +{ + return (((uint16_t) a.sign)<<15) | 0x7E00 | (uint16_t)(a.hi>>54); +} + +#endif + +/*---------------------------------------------------------------------------- +| The pattern for a default generated single-precision NaN. +*----------------------------------------------------------------------------*/ +#define float32_default_nan 0xFFC00000 + +#define float32_fraction extractFloat32Frac +#define float32_exp extractFloat32Exp +#define float32_sign extractFloat32Sign + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline uint32_t extractFloat32Frac(float32 a) +{ + return a & 0x007FFFFF; +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline int16_t extractFloat32Exp(float32 a) +{ + return (a>>23) & 0xFF; +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline int extractFloat32Sign(float32 a) +{ + return a>>31; +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| single-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +inline float32 packFloat32(int zSign, int16_t zExp, uint32_t zSig) +{ + return (((uint32_t) zSign)<<31) + (((uint32_t) zExp)<<23) + zSig; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float32_is_nan(float32 a) +{ + return (0xFF000000 < (uint32_t) (a<<1)); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float32_is_signaling_nan(float32 a) +{ + return (((a>>22) & 0x1FF) == 0x1FE) && (a & 0x003FFFFF); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is denormal; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float32_is_denormal(float32 a) +{ + return (extractFloat32Exp(a) == 0) && (extractFloat32Frac(a) != 0); +} + +/*---------------------------------------------------------------------------- +| Convert float32 denormals to zero. +*----------------------------------------------------------------------------*/ + +inline float32 float32_denormal_to_zero(float32 a) +{ + if (float32_is_denormal(a)) a &= 0x80000000; + return a; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +inline commonNaNT float32ToCommonNaN(float32 a, float_status_t &status) +{ + commonNaNT z; + if (float32_is_signaling_nan(a)) float_raise(status, float_flag_invalid); + z.sign = a>>31; + z.lo = 0; + z.hi = ((uint64_t) a)<<41; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the single- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +inline float32 commonNaNToFloat32(commonNaNT a) +{ + return (((uint32_t) a.sign)<<31) | 0x7FC00000 | (uint32_t)(a.hi>>41); +} + +/*---------------------------------------------------------------------------- +| Takes two single-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +float32 propagateFloat32NaN(float32 a, float32 b, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes single-precision floating-point NaN `a' and returns the appropriate +| NaN result. If `a' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +inline float32 propagateFloat32NaN(float32 a, float_status_t &status) +{ + if (float32_is_signaling_nan(a)) + float_raise(status, float_flag_invalid); + + return a | 0x00400000; +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. +*----------------------------------------------------------------------------*/ +#define float64_default_nan UINT64_C(0xFFF8000000000000) + +#define float64_fraction extractFloat64Frac +#define float64_exp extractFloat64Exp +#define float64_sign extractFloat64Sign + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline uint64_t extractFloat64Frac(float64 a) +{ + return a & UINT64_C(0x000FFFFFFFFFFFFF); +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline int16_t extractFloat64Exp(float64 a) +{ + return (int16_t)(a>>52) & 0x7FF; +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline int extractFloat64Sign(float64 a) +{ + return (int)(a>>63); +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| double-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +inline float64 packFloat64(int zSign, int16_t zExp, uint64_t zSig) +{ + return (((uint64_t) zSign)<<63) + (((uint64_t) zExp)<<52) + zSig; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float64_is_nan(float64 a) +{ + return (UINT64_C(0xFFE0000000000000) < (uint64_t) (a<<1)); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float64_is_signaling_nan(float64 a) +{ + return (((a>>51) & 0xFFF) == 0xFFE) && (a & UINT64_C(0x0007FFFFFFFFFFFF)); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is denormal; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float64_is_denormal(float64 a) +{ + return (extractFloat64Exp(a) == 0) && (extractFloat64Frac(a) != 0); +} + +/*---------------------------------------------------------------------------- +| Convert float64 denormals to zero. +*----------------------------------------------------------------------------*/ + +inline float64 float64_denormal_to_zero(float64 a) +{ + if (float64_is_denormal(a)) a &= ((uint64_t)(1) << 63); + return a; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +inline commonNaNT float64ToCommonNaN(float64 a, float_status_t &status) +{ + commonNaNT z; + if (float64_is_signaling_nan(a)) float_raise(status, float_flag_invalid); + z.sign = (int)(a>>63); + z.lo = 0; + z.hi = a<<12; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the double- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +inline float64 commonNaNToFloat64(commonNaNT a) +{ + return (((uint64_t) a.sign)<<63) | UINT64_C(0x7FF8000000000000) | (a.hi>>12); +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +float64 propagateFloat64NaN(float64 a, float64 b, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes double-precision floating-point NaN `a' and returns the appropriate +| NaN result. If `a' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +inline float64 propagateFloat64NaN(float64 a, float_status_t &status) +{ + if (float64_is_signaling_nan(a)) + float_raise(status, float_flag_invalid); + + return a | UINT64_C(0x0008000000000000); +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. The +| `high' and `low' values hold the most- and least-significant bits, +| respectively. +*----------------------------------------------------------------------------*/ +#define floatx80_default_nan_exp 0xFFFF +#define floatx80_default_nan_fraction UINT64_C(0xC000000000000000) + +#define floatx80_fraction extractFloatx80Frac +#define floatx80_exp extractFloatx80Exp +#define floatx80_sign extractFloatx80Sign + +#define EXP_BIAS 0x3FFF + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +inline uint64_t extractFloatx80Frac(floatx80 a) +{ + return a.fraction; +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +inline int32_t extractFloatx80Exp(floatx80 a) +{ + return a.exp & 0x7FFF; +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the extended double-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +inline int extractFloatx80Sign(floatx80 a) +{ + return a.exp>>15; +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an +| extended double-precision floating-point value, returning the result. +*----------------------------------------------------------------------------*/ + +inline floatx80 packFloatx80(int zSign, int32_t zExp, uint64_t zSig) +{ + floatx80 z; + z.fraction = zSig; + z.exp = (zSign << 15) + zExp; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int floatx80_is_nan(floatx80 a) +{ + return ((a.exp & 0x7FFF) == 0x7FFF) && (int64_t) (a.fraction<<1); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int floatx80_is_signaling_nan(floatx80 a) +{ + uint64_t aLow = a.fraction & ~UINT64_C(0x4000000000000000); + return ((a.exp & 0x7FFF) == 0x7FFF) && + ((uint64_t) (aLow<<1)) && (a.fraction == aLow); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is an +| unsupported; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int floatx80_is_unsupported(floatx80 a) +{ + return ((a.exp & 0x7FFF) && !(a.fraction & UINT64_C(0x8000000000000000))); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the +| invalid exception is raised. +*----------------------------------------------------------------------------*/ + +inline commonNaNT floatx80ToCommonNaN(floatx80 a, float_status_t &status) +{ + commonNaNT z; + if (floatx80_is_signaling_nan(a)) float_raise(status, float_flag_invalid); + z.sign = a.exp >> 15; + z.lo = 0; + z.hi = a.fraction << 1; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the extended +| double-precision floating-point format. +*----------------------------------------------------------------------------*/ + +inline floatx80 commonNaNToFloatx80(commonNaNT a) +{ + floatx80 z; + z.fraction = UINT64_C(0xC000000000000000) | (a.hi>>1); + z.exp = (((uint16_t) a.sign)<<15) | 0x7FFF; + return z; +} + +/*---------------------------------------------------------------------------- +| Takes two extended double-precision floating-point values `a' and `b', one +| of which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +floatx80 propagateFloatx80NaN(floatx80 a, floatx80 b, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Takes extended double-precision floating-point NaN `a' and returns the +| appropriate NaN result. If `a' is a signaling NaN, the invalid exception +| is raised. +*----------------------------------------------------------------------------*/ + +inline floatx80 propagateFloatx80NaN(floatx80 a, float_status_t &status) +{ + if (floatx80_is_signaling_nan(a)) + float_raise(status, float_flag_invalid); + + a.fraction |= UINT64_C(0xC000000000000000); + + return a; +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. +*----------------------------------------------------------------------------*/ +extern const floatx80 floatx80_default_nan; + +#endif /* FLOATX80 */ + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. The `high' and +| `low' values hold the most- and least-significant bits, respectively. +*----------------------------------------------------------------------------*/ +#define float128_default_nan_hi UINT64_C(0xFFFF800000000000) +#define float128_default_nan_lo UINT64_C(0x0000000000000000) + +#define float128_exp extractFloat128Exp + +/*---------------------------------------------------------------------------- +| Returns the least-significant 64 fraction bits of the quadruple-precision +| floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline uint64_t extractFloat128Frac1(float128 a) +{ + return a.lo; +} + +/*---------------------------------------------------------------------------- +| Returns the most-significant 48 fraction bits of the quadruple-precision +| floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline uint64_t extractFloat128Frac0(float128 a) +{ + return a.hi & UINT64_C(0x0000FFFFFFFFFFFF); +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the quadruple-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +inline int32_t extractFloat128Exp(float128 a) +{ + return ((int32_t)(a.hi>>48)) & 0x7FFF; +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the quadruple-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +inline int extractFloat128Sign(float128 a) +{ + return (int)(a.hi >> 63); +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', the exponent `zExp', and the significand formed +| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision +| floating-point value, returning the result. After being shifted into the +| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply +| added together to form the most significant 32 bits of the result. This +| means that any integer portion of `zSig0' will be added into the exponent. +| Since a properly normalized significand will have an integer portion equal +| to 1, the `zExp' input should be 1 less than the desired result exponent +| whenever `zSig0' and `zSig1' concatenated form a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +inline float128 packFloat128(int zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) +{ + float128 z; + z.lo = zSig1; + z.hi = (((uint64_t) zSign)<<63) + (((uint64_t) zExp)<<48) + zSig0; + return z; +} + +/*---------------------------------------------------------------------------- +| Packs two 64-bit precision integers into into the quadruple-precision +| floating-point value, returning the result. +*----------------------------------------------------------------------------*/ + +inline float128 packFloat128(uint64_t zHi, uint64_t zLo) +{ + float128 z; + z.lo = zLo; + z.hi = zHi; + return z; +} + +#ifdef _MSC_VER +#define PACK_FLOAT_128(hi,lo) { lo, hi } +#else +#define PACK_FLOAT_128(hi,lo) packFloat128(UINT64_C(hi),UINT64_C(lo)) +#endif + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float128_is_nan(float128 a) +{ + return (UINT64_C(0xFFFE000000000000) <= (uint64_t) (a.hi<<1)) + && (a.lo || (a.hi & UINT64_C(0x0000FFFFFFFFFFFF))); +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int float128_is_signaling_nan(float128 a) +{ + return (((a.hi>>47) & 0xFFFF) == 0xFFFE) + && (a.lo || (a.hi & UINT64_C(0x00007FFFFFFFFFFF))); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +inline commonNaNT float128ToCommonNaN(float128 a, float_status_t &status) +{ + commonNaNT z; + if (float128_is_signaling_nan(a)) float_raise(status, float_flag_invalid); + z.sign = (int)(a.hi>>63); + shortShift128Left(a.hi, a.lo, 16, &z.hi, &z.lo); + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the quadruple- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +inline float128 commonNaNToFloat128(commonNaNT a) +{ + float128 z; + shift128Right(a.hi, a.lo, 16, &z.hi, &z.lo); + z.hi |= (((uint64_t) a.sign)<<63) | UINT64_C(0x7FFF800000000000); + return z; +} + +/*---------------------------------------------------------------------------- +| Takes two quadruple-precision floating-point values `a' and `b', one of +| which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +float128 propagateFloat128NaN(float128 a, float128 b, float_status_t &status); + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. +*----------------------------------------------------------------------------*/ +extern const float128 float128_default_nan; + +#endif /* FLOAT128 */ + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat-specialize.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat-specialize.cc Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,197 @@ +/*============================================================================ +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +#define FLOAT128 + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#include "softfloat.hh" +#include "softfloat-specialize.hh" + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Takes two single-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +float32 propagateFloat32NaN(float32 a, float32 b, float_status_t &status) +{ + int aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = float32_is_nan(a); + aIsSignalingNaN = float32_is_signaling_nan(a); + bIsNaN = float32_is_nan(b); + bIsSignalingNaN = float32_is_signaling_nan(b); + a |= 0x00400000; + b |= 0x00400000; + if (aIsSignalingNaN | bIsSignalingNaN) float_raise(status, float_flag_invalid); + if (get_float_nan_handling_mode(status) == float_larger_significand_nan) { + if (aIsSignalingNaN) { + if (bIsSignalingNaN) goto returnLargerSignificand; + return bIsNaN ? b : a; + } + else if (aIsNaN) { + if (bIsSignalingNaN | ! bIsNaN) return a; + returnLargerSignificand: + if ((uint32_t) (a<<1) < (uint32_t) (b<<1)) return b; + if ((uint32_t) (b<<1) < (uint32_t) (a<<1)) return a; + return (a < b) ? a : b; + } + else { + return b; + } + } else { + return (aIsSignalingNaN | aIsNaN) ? a : b; + } +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +float64 propagateFloat64NaN(float64 a, float64 b, float_status_t &status) +{ + int aIsNaN = float64_is_nan(a); + int aIsSignalingNaN = float64_is_signaling_nan(a); + int bIsNaN = float64_is_nan(b); + int bIsSignalingNaN = float64_is_signaling_nan(b); + a |= UINT64_C(0x0008000000000000); + b |= UINT64_C(0x0008000000000000); + if (aIsSignalingNaN | bIsSignalingNaN) float_raise(status, float_flag_invalid); + if (get_float_nan_handling_mode(status) == float_larger_significand_nan) { + if (aIsSignalingNaN) { + if (bIsSignalingNaN) goto returnLargerSignificand; + return bIsNaN ? b : a; + } + else if (aIsNaN) { + if (bIsSignalingNaN | ! bIsNaN) return a; + returnLargerSignificand: + if ((uint64_t) (a<<1) < (uint64_t) (b<<1)) return b; + if ((uint64_t) (b<<1) < (uint64_t) (a<<1)) return a; + return (a < b) ? a : b; + } + else { + return b; + } + } else { + return (aIsSignalingNaN | aIsNaN) ? a : b; + } +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Takes two extended double-precision floating-point values `a' and `b', one +| of which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +floatx80 propagateFloatx80NaN(floatx80 a, floatx80 b, float_status_t &status) +{ + int aIsNaN = floatx80_is_nan(a); + int aIsSignalingNaN = floatx80_is_signaling_nan(a); + int bIsNaN = floatx80_is_nan(b); + int bIsSignalingNaN = floatx80_is_signaling_nan(b); + a.fraction |= UINT64_C(0xC000000000000000); + b.fraction |= UINT64_C(0xC000000000000000); + if (aIsSignalingNaN | bIsSignalingNaN) float_raise(status, float_flag_invalid); + if (aIsSignalingNaN) { + if (bIsSignalingNaN) goto returnLargerSignificand; + return bIsNaN ? b : a; + } + else if (aIsNaN) { + if (bIsSignalingNaN | ! bIsNaN) return a; + returnLargerSignificand: + if (a.fraction < b.fraction) return b; + if (b.fraction < a.fraction) return a; + return (a.exp < b.exp) ? a : b; + } + else { + return b; + } +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. +*----------------------------------------------------------------------------*/ +const floatx80 floatx80_default_nan = + packFloatx80(0, floatx80_default_nan_exp, floatx80_default_nan_fraction); + +#endif /* FLOATX80 */ + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Takes two quadruple-precision floating-point values `a' and `b', one of +| which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +float128 propagateFloat128NaN(float128 a, float128 b, float_status_t &status) +{ + int aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + aIsNaN = float128_is_nan(a); + aIsSignalingNaN = float128_is_signaling_nan(a); + bIsNaN = float128_is_nan(b); + bIsSignalingNaN = float128_is_signaling_nan(b); + a.hi |= UINT64_C(0x0000800000000000); + b.hi |= UINT64_C(0x0000800000000000); + if (aIsSignalingNaN | bIsSignalingNaN) float_raise(status, float_flag_invalid); + if (aIsSignalingNaN) { + if (bIsSignalingNaN) goto returnLargerSignificand; + return bIsNaN ? b : a; + } + else if (aIsNaN) { + if (bIsSignalingNaN | !bIsNaN) return a; + returnLargerSignificand: + if (lt128(a.hi<<1, a.lo, b.hi<<1, b.lo)) return b; + if (lt128(b.hi<<1, b.lo, a.hi<<1, a.lo)) return a; + return (a.hi < b.hi) ? a : b; + } + else { + return b; + } +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. +*----------------------------------------------------------------------------*/ +const float128 float128_default_nan = + packFloat128(float128_default_nan_hi, float128_default_nan_lo); + +END_SOFTFLOAT_NS + +#endif /* FLOAT128 */ diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat.hh Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,450 @@ +/*============================================================================ +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#ifndef _SOFTFLOAT_HH_ +#define _SOFTFLOAT_HH_ + +#include +#include "softfloat_ns.hh" + +// This lets us figure out what the byte order of the host system is +#if defined(__linux__) +#include +#elif defined (__sun) +#include +#else +#include +#endif + +#if defined(_BIG_ENDIAN) || BYTE_ORDER == BIG_ENDIAN +#define SOFTFLOAT_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) || BYTE_ORDER == LITTLE_ENDIAN +#define SOFTFLOAT_LITTLE_ENDIAN +#else +#error Cannot determine host byte order. +#endif + +BEGIN_SOFTFLOAT_NS + +#define FLOAT16 +#define FLOATX80 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ +#ifdef FLOAT16 +typedef uint16_t float16; +#endif +typedef uint32_t float32; +typedef uint64_t float64; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point class. +*----------------------------------------------------------------------------*/ +typedef enum { + float_zero, + float_NaN, + float_negative_inf, + float_positive_inf, + float_denormal, + float_normalized +} float_class_t; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point NaN operands handling mode. +*----------------------------------------------------------------------------*/ +enum float_nan_handling_mode_t { + float_larger_significand_nan = 0, // this mode used by x87 FPU + float_first_operand_nan = 1 // this mode used by SSE +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +enum float_round_t { + float_round_nearest_even = 0, + float_round_down = 1, + float_round_up = 2, + float_round_to_zero = 3 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point exception flags. +*----------------------------------------------------------------------------*/ +enum float_exception_flag_t { + float_flag_invalid = 0x01, + float_flag_denormal = 0x02, + float_flag_divbyzero = 0x04, + float_flag_overflow = 0x08, + float_flag_underflow = 0x10, + float_flag_inexact = 0x20 +}; + +#ifdef FLOATX80 +#define RAISE_SW_C1 0x0200 +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point ordering relations +*----------------------------------------------------------------------------*/ +enum { + float_relation_less = -1, + float_relation_equal = 0, + float_relation_greater = 1, + float_relation_unordered = 2 +}; + +/*---------------------------------------------------------------------------- +| Options to indicate which negations to perform in float*_muladd() +| Using these differs from negating an input or output before calling +| the muladd function in that this means that a NaN doesn't have its +| sign bit inverted before it is propagated. +*----------------------------------------------------------------------------*/ +enum { + float_muladd_negate_c = 1, + float_muladd_negate_product = 2, + float_muladd_negate_result = float_muladd_negate_c | float_muladd_negate_product +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point status structure. +*----------------------------------------------------------------------------*/ +struct float_status_t +{ +#ifdef FLOATX80 + int float_rounding_precision; /* floatx80 only */ +#endif + int float_rounding_mode; + int float_exception_flags; + int float_exception_masks; + int float_nan_handling_mode; /* flag register */ + int flush_underflow_to_zero; /* flag register */ + int denormals_are_zeros; /* flag register */ +}; + +/*---------------------------------------------------------------------------- +| Routine to raise any or all of the software IEC/IEEE floating-point +| exception flags. +*----------------------------------------------------------------------------*/ + +inline void float_raise(float_status_t &status, int flags) +{ + status.float_exception_flags |= flags; +} + +/*---------------------------------------------------------------------------- +| Routine to check if any or all of the software IEC/IEEE floating-point +| exceptions are masked. +*----------------------------------------------------------------------------*/ + +inline int float_exception_masked(const float_status_t &status, int flag) +{ + return status.float_exception_masks & flag; +} + +/*---------------------------------------------------------------------------- +| Returns current floating point rounding mode specified by status word. +*----------------------------------------------------------------------------*/ + +inline int get_float_rounding_mode(const float_status_t &status) +{ + return status.float_rounding_mode; +} + +/*---------------------------------------------------------------------------- +| Returns current floating point precision (floatx80 only). +*----------------------------------------------------------------------------*/ + +#ifdef FLOATX80 +inline int get_float_rounding_precision(const float_status_t &status) +{ + return status.float_rounding_precision; +} +#endif + +/*---------------------------------------------------------------------------- +| Returns current floating point NaN operands handling mode specified +| by status word. +*----------------------------------------------------------------------------*/ + +inline int get_float_nan_handling_mode(const float_status_t &status) +{ + return status.float_nan_handling_mode; +} + +/*---------------------------------------------------------------------------- +| Raise floating point precision lost up flag (floatx80 only). +*----------------------------------------------------------------------------*/ + +#ifdef FLOATX80 +inline void set_float_rounding_up(float_status_t &status) +{ + status.float_exception_flags |= RAISE_SW_C1; +} +#endif + +/*---------------------------------------------------------------------------- +| Returns 1 if the feature is supported; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int get_denormals_are_zeros(const float_status_t &status) +{ + return status.denormals_are_zeros; +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the feature is supported; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +inline int get_flush_underflow_to_zero(const float_status_t &status) +{ + return status.flush_underflow_to_zero; +} + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ +float32 int32_to_float32(int32_t, float_status_t &status); +float64 int32_to_float64(int32_t); +float32 int64_to_float32(int64_t, float_status_t &status); +float64 int64_to_float64(int64_t, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32_t float32_to_int32(float32, float_status_t &status); +int32_t float32_to_int32_round_to_zero(float32, float_status_t &status); +int64_t float32_to_int64(float32, float_status_t &status); +int64_t float32_to_int64_round_to_zero(float32, float_status_t &status); +float64 float32_to_float64(float32, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. +*----------------------------------------------------------------------------*/ +float32 float32_round_to_int(float32, float_status_t &status); +float32 float32_add(float32, float32, float_status_t &status); +float32 float32_sub(float32, float32, float_status_t &status); +float32 float32_mul(float32, float32, float_status_t &status); +float32 float32_div(float32, float32, float_status_t &status); +float32 float32_sqrt(float32, float_status_t &status); +float32 float32_frc(float32, float_status_t &status); +float32 float32_muladd(float32, float32, float32, int flags, float_status_t &status); + +inline float32 float32_fmadd(float32 a, float32 b, float32 c, float_status_t &status) +{ + return float32_muladd(a, b, c, 0, status); +} + +inline float32 float32_fmsub(float32 a, float32 b, float32 c, float_status_t &status) +{ + return float32_muladd(a, b, c, float_muladd_negate_c, status); +} + +inline float32 float32_fnmadd(float32 a, float32 b, float32 c, float_status_t &status) +{ + return float32_muladd(a, b, c, float_muladd_negate_product, status); +} + +inline float32 float32_fnmsub(float32 a, float32 b, float32 c, float_status_t &status) +{ + return float32_muladd(a, b, c, float_muladd_negate_result, status); +} + +int float32_compare(float32, float32, float_status_t &status); +int float32_compare_quiet(float32, float32, float_status_t &status); + +float_class_t float32_class(float32); +int float32_is_signaling_nan(float32); +int float32_is_nan(float32); +int float32_is_denormal(float32); + +float32 float32_min(float32 a, float32 b, float_status_t &status); +float32 float32_max(float32 a, float32 b, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32_t float64_to_int32(float64, float_status_t &status); +int32_t float64_to_int32_round_to_zero(float64, float_status_t &status); +int64_t float64_to_int64(float64, float_status_t &status); +int64_t float64_to_int64_round_to_zero(float64, float_status_t &status); +float32 float64_to_float32(float64, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. +*----------------------------------------------------------------------------*/ +float64 float64_round_to_int(float64, float_status_t &status); +float64 float64_add(float64, float64, float_status_t &status); +float64 float64_sub(float64, float64, float_status_t &status); +float64 float64_mul(float64, float64, float_status_t &status); +float64 float64_div(float64, float64, float_status_t &status); +float64 float64_sqrt(float64, float_status_t &status); +float64 float64_frc(float64, float_status_t &status); +float64 float64_muladd(float64, float64, float64, int flags, float_status_t &status); + +inline float64 float64_fmadd(float64 a, float64 b, float64 c, float_status_t &status) +{ + return float64_muladd(a, b, c, 0, status); +} + +inline float64 float64_fmsub(float64 a, float64 b, float64 c, float_status_t &status) +{ + return float64_muladd(a, b, c, float_muladd_negate_c, status); +} + +inline float64 float64_fnmadd(float64 a, float64 b, float64 c, float_status_t &status) +{ + return float64_muladd(a, b, c, float_muladd_negate_product, status); +} + +inline float64 float64_fnmsub(float64 a, float64 b, float64 c, float_status_t &status) +{ + return float64_muladd(a, b, c, float_muladd_negate_result, status); +} + +int float64_compare(float64, float64, float_status_t &status); +int float64_compare_quiet(float64, float64, float_status_t &status); + +float_class_t float64_class(float64); +int float64_is_signaling_nan(float64); +int float64_is_nan(float64); +int float64_is_denormal(float64); + +float64 float64_min(float64 a, float64 b, float_status_t &status); +float64 float64_max(float64 a, float64 b, float_status_t &status); + +#ifdef FLOAT16 +float32 float16_to_float32(float16, float_status_t &status); +float16 float32_to_float16(float32, float_status_t &status); + +float_class_t float16_class(float16); +int float16_is_signaling_nan(float16); +int float16_is_nan(float16); +int float16_is_denormal(float16); +#endif + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ + +#if defined(SOFTFLOAT_BIG_ENDIAN) +struct floatx80 { // leave alignment to compiler + uint16_t exp; + uint64_t fraction; +}; +#elif defined(SOFTFLOAT_LITTLE_ENDIAN) +struct floatx80 { + uint64_t fraction; + uint16_t exp; +}; +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ +floatx80 int32_to_floatx80(int32_t); +floatx80 int64_to_floatx80(int64_t); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision conversion routines. +*----------------------------------------------------------------------------*/ +floatx80 float32_to_floatx80(float32, float_status_t &status); +floatx80 float64_to_floatx80(float64, float_status_t &status); + +int32_t floatx80_to_int32(floatx80, float_status_t &status); +int32_t floatx80_to_int32_round_to_zero(floatx80, float_status_t &status); +int64_t floatx80_to_int64(floatx80, float_status_t &status); +int64_t floatx80_to_int64_round_to_zero(floatx80, float_status_t &status); + +float32 floatx80_to_float32(floatx80, float_status_t &status); +float64 floatx80_to_float64(floatx80, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. +*----------------------------------------------------------------------------*/ +floatx80 floatx80_round_to_int(floatx80, float_status_t &status); +floatx80 floatx80_add(floatx80, floatx80, float_status_t &status); +floatx80 floatx80_sub(floatx80, floatx80, float_status_t &status); +floatx80 floatx80_mul(floatx80, floatx80, float_status_t &status); +floatx80 floatx80_div(floatx80, floatx80, float_status_t &status); +floatx80 floatx80_sqrt(floatx80, float_status_t &status); + +float_class_t floatx80_class(floatx80); +int floatx80_is_signaling_nan(floatx80); +int floatx80_is_nan(floatx80); + +#endif /* FLOATX80 */ + +#ifdef FLOAT128 + +#ifdef BX_BIG_ENDIAN +struct float128 { + uint64_t hi, lo; +}; +#else +struct float128 { + uint64_t lo, hi; +}; +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision conversion routines. +*----------------------------------------------------------------------------*/ +float128 floatx80_to_float128(floatx80 a, float_status_t &status); +floatx80 float128_to_floatx80(float128 a, float_status_t &status); + +float128 int64_to_float128(int64_t a); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. +*----------------------------------------------------------------------------*/ +floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision operations. +*----------------------------------------------------------------------------*/ +float128 float128_add(float128 a, float128 b, float_status_t &status); +float128 float128_sub(float128 a, float128 b, float_status_t &status); +float128 float128_mul(float128 a, float128 b, float_status_t &status); +float128 float128_div(float128 a, float128 b, float_status_t &status); + +#endif /* FLOAT128 */ + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat.cc Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,3208 @@ +/*============================================================================ +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +#define FLOAT128 + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#include "softfloat.hh" +#include "softfloat-round-pack.hh" + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target +| if desired). +*----------------------------------------------------------------------------*/ +#define USE_estimateDiv128To64 +#define USE_estimateSqrt32 +#include "softfloat-macros.hh" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize.hh" + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the single-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 int32_to_float32(int32_t a, float_status_t &status) +{ + if (a == 0) return 0; + if (a == (int32_t) 0x80000000) return packFloat32(1, 0x9E, 0); + int zSign = (a < 0); + return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the double-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 int32_to_float64(int32_t a) +{ + if (a == 0) return 0; + int zSign = (a < 0); + uint32_t absA = zSign ? -a : a; + int shiftCount = countLeadingZeros32(absA) + 21; + uint64_t zSig = absA; + return packFloat64(zSign, 0x432 - shiftCount, zSig<>(-shiftCount); + if ((uint32_t) (aSig<<(shiftCount & 31))) { + float_raise(status, float_flag_inexact); + } + if (aSign) z = -z; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic - which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN or the +| conversion overflows, the integer indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int64_t float32_to_int64(float32 a, float_status_t &status) +{ + uint64_t aSig64, aSigExtra; + + uint32_t aSig = extractFloat32Frac(a); + int16_t aExp = extractFloat32Exp(a); + int aSign = extractFloat32Sign(a); + + int shiftCount = 0xBE - aExp; + if (shiftCount < 0) { + float_raise(status, float_flag_invalid); + return (int64_t)(int64_indefinite); + } + if (aExp) aSig |= 0x00800000; + else { + if (get_denormals_are_zeros(status)) aSig = 0; + } + aSig64 = aSig; + aSig64 <<= 40; + shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); + return roundAndPackInt64(aSign, aSig64, aSigExtra, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN or the conversion overflows, the integer indefinite +| value is returned. +*----------------------------------------------------------------------------*/ + +int64_t float32_to_int64_round_to_zero(float32 a, float_status_t &status) +{ + int aSign; + int16_t aExp; + uint32_t aSig; + uint64_t aSig64; + int64_t z; + + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + aSign = extractFloat32Sign(a); + int shiftCount = aExp - 0xBE; + if (0 <= shiftCount) { + if (a != 0xDF000000) { + float_raise(status, float_flag_invalid); + } + return (int64_t)(int64_indefinite); + } + else if (aExp <= 0x7E) { + if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0; + if (aExp | aSig) float_raise(status, float_flag_inexact); + return 0; + } + aSig64 = aSig | 0x00800000; + aSig64 <<= 40; + z = aSig64>>(-shiftCount); + if ((uint64_t) (aSig64<<(shiftCount & 63))) { + float_raise(status, float_flag_inexact); + } + if (aSign) z = -z; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float32_to_float64(float32 a, float_status_t &status) +{ + uint32_t aSig = extractFloat32Frac(a); + int16_t aExp = extractFloat32Exp(a); + int aSign = extractFloat32Sign(a); + + if (aExp == 0xFF) { + if (aSig) return commonNaNToFloat64(float32ToCommonNaN(a, status)); + return packFloat64(aSign, 0x7FF, 0); + } + if (aExp == 0) { + if (get_denormals_are_zeros(status)) aSig = 0; + if (aSig == 0) return packFloat64(aSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(aSig, &aExp, &aSig); + --aExp; + } + return packFloat64(aSign, aExp + 0x380, ((uint64_t) aSig)<<29); +} + +/*---------------------------------------------------------------------------- +| Rounds the single-precision floating-point value `a' to an integer, and +| returns the result as a single-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_round_to_int(float32 a, float_status_t &status) +{ + uint32_t lastBitMask, roundBitsMask; + int roundingMode = get_float_rounding_mode(status); + + int16_t aExp = extractFloat32Exp(a); + if (0x96 <= aExp) { + if ((aExp == 0xFF) && extractFloat32Frac(a)) { + return propagateFloat32NaN(a, status); + } + return a; + } + + if (get_denormals_are_zeros(status)) { + a = float32_denormal_to_zero(a); + } + + if (aExp <= 0x7E) { + if ((uint32_t) (a<<1) == 0) return a; + float_raise(status, float_flag_inexact); + int aSign = extractFloat32Sign(a); + switch (roundingMode) { + case float_round_nearest_even: + if ((aExp == 0x7E) && extractFloat32Frac(a)) { + return packFloat32(aSign, 0x7F, 0); + } + break; + case float_round_down: + return aSign ? 0xBF800000 : 0; + case float_round_up: + return aSign ? 0x80000000 : 0x3F800000; + } + return packFloat32(aSign, 0, 0); + } + lastBitMask = 1; + lastBitMask <<= 0x96 - aExp; + roundBitsMask = lastBitMask - 1; + float32 z = a; + if (roundingMode == float_round_nearest_even) { + z += lastBitMask>>1; + if ((z & roundBitsMask) == 0) z &= ~lastBitMask; + } + else if (roundingMode != float_round_to_zero) { + if (extractFloat32Sign(z) ^ (roundingMode == float_round_up)) { + z += roundBitsMask; + } + } + z &= ~roundBitsMask; + if (z != a) float_raise(status, float_flag_inexact); + return z; +} + +/*---------------------------------------------------------------------------- +| Extracts the fractional portion of single-precision floating-point value `a', +| and returns the result as a single-precision floating-point value. The +| fractional results are precise. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_frc(float32 a, float_status_t &status) +{ + int roundingMode = get_float_rounding_mode(status); + + int16_t aExp = extractFloat32Exp(a); + uint32_t aSig = extractFloat32Frac(a); + int aSign = extractFloat32Sign(a); + + if (aExp == 0xFF) { + if (aSig) return propagateFloat32NaN(a, status); + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + + if (aExp >= 0x96) { + return packFloat32(roundingMode == float_round_down, 0, 0); + } + + if (aExp < 0x7F) { + if (aExp == 0) { + if (get_denormals_are_zeros(status)) aSig = 0; + if (aSig == 0) { + return packFloat32(roundingMode == float_round_down, 0, 0); + } + + float_raise(status, float_flag_denormal); + if (! float_exception_masked(status, float_flag_underflow)) + float_raise(status, float_flag_underflow); + + if(get_flush_underflow_to_zero(status)) { + float_raise(status, float_flag_underflow | float_flag_inexact); + return packFloat32(aSign, 0, 0); + } + } + return a; + } + + uint32_t lastBitMask = 1 << (0x96 - aExp); + uint32_t roundBitsMask = lastBitMask - 1; + + aSig &= roundBitsMask; + aSig <<= 7; + aExp--; + + if (aSig == 0) + return packFloat32(roundingMode == float_round_down, 0, 0); + + return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the single-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 addFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status) +{ + int16_t aExp, bExp, zExp; + uint32_t aSig, bSig, zSig; + int16_t expDiff; + + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + bSig = extractFloat32Frac(b); + bExp = extractFloat32Exp(b); + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + expDiff = aExp - bExp; + aSig <<= 6; + bSig <<= 6; + + if (0 < expDiff) { + if (aExp == 0xFF) { + if (aSig) return propagateFloat32NaN(a, b, status); + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return a; + } + if ((aExp == 0) && aSig) + float_raise(status, float_flag_denormal); + + if (bExp == 0) { + if (bSig) float_raise(status, float_flag_denormal); + --expDiff; + } + else bSig |= 0x20000000; + + bSig = shift32RightJamming(bSig, expDiff); + zExp = aExp; + } + else if (expDiff < 0) { + if (bExp == 0xFF) { + if (bSig) return propagateFloat32NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat32(zSign, 0xFF, 0); + } + if ((bExp == 0) && bSig) + float_raise(status, float_flag_denormal); + + if (aExp == 0) { + if (aSig) float_raise(status, float_flag_denormal); + ++expDiff; + } + else aSig |= 0x20000000; + + aSig = shift32RightJamming(aSig, -expDiff); + zExp = bExp; + } + else { + if (aExp == 0xFF) { + if (aSig | bSig) return propagateFloat32NaN(a, b, status); + return a; + } + if (aExp == 0) { + zSig = (aSig + bSig) >> 6; + if (aSig | bSig) { + float_raise(status, float_flag_denormal); + if (get_flush_underflow_to_zero(status)) { + float_raise(status, float_flag_underflow | float_flag_inexact); + return packFloat32(zSign, 0, 0); + } + if (! float_exception_masked(status, float_flag_underflow)) { + if (extractFloat32Frac(zSig) == zSig) + float_raise(status, float_flag_underflow); + } + } + return packFloat32(zSign, 0, zSig); + } + zSig = 0x40000000 + aSig + bSig; + return roundAndPackFloat32(zSign, aExp, zSig, status); + } + aSig |= 0x20000000; + zSig = (aSig + bSig)<<1; + --zExp; + if ((int32_t) zSig < 0) { + zSig = aSig + bSig; + ++zExp; + } + return roundAndPackFloat32(zSign, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the single- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 subFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status) +{ + int16_t aExp, bExp, zExp; + uint32_t aSig, bSig, zSig; + int16_t expDiff; + + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + bSig = extractFloat32Frac(b); + bExp = extractFloat32Exp(b); + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + expDiff = aExp - bExp; + aSig <<= 7; + bSig <<= 7; + if (0 < expDiff) goto aExpBigger; + if (expDiff < 0) goto bExpBigger; + if (aExp == 0xFF) { + if (aSig | bSig) return propagateFloat32NaN(a, b, status); + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + if (aExp == 0) { + if (aSig | bSig) float_raise(status, float_flag_denormal); + aExp = 1; + bExp = 1; + } + if (bSig < aSig) goto aBigger; + if (aSig < bSig) goto bBigger; + return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0); + bExpBigger: + if (bExp == 0xFF) { + if (bSig) return propagateFloat32NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat32(zSign ^ 1, 0xFF, 0); + } + if ((bExp == 0) && bSig) + float_raise(status, float_flag_denormal); + + if (aExp == 0) { + if (aSig) float_raise(status, float_flag_denormal); + ++expDiff; + } + else aSig |= 0x40000000; + + aSig = shift32RightJamming(aSig, -expDiff); + bSig |= 0x40000000; + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if (aExp == 0xFF) { + if (aSig) return propagateFloat32NaN(a, b, status); + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return a; + } + if ((aExp == 0) && aSig) + float_raise(status, float_flag_denormal); + + if (bExp == 0) { + if (bSig) float_raise(status, float_flag_denormal); + --expDiff; + } + else bSig |= 0x40000000; + + bSig = shift32RightJamming(bSig, expDiff); + aSig |= 0x40000000; + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the single-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_add(float32 a, float32 b, float_status_t &status) +{ + int aSign = extractFloat32Sign(a); + int bSign = extractFloat32Sign(b); + + if (aSign == bSign) { + return addFloat32Sigs(a, b, aSign, status); + } + else { + return subFloat32Sigs(a, b, aSign, status); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sub(float32 a, float32 b, float_status_t &status) +{ + int aSign = extractFloat32Sign(a); + int bSign = extractFloat32Sign(b); + + if (aSign == bSign) { + return subFloat32Sigs(a, b, aSign, status); + } + else { + return addFloat32Sigs(a, b, aSign, status); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_mul(float32 a, float32 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int16_t aExp, bExp, zExp; + uint32_t aSig, bSig; + uint64_t zSig64; + uint32_t zSig; + + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + aSign = extractFloat32Sign(a); + bSig = extractFloat32Frac(b); + bExp = extractFloat32Exp(b); + bSign = extractFloat32Sign(b); + zSign = aSign ^ bSign; + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + if (aExp == 0xFF) { + if (aSig || ((bExp == 0xFF) && bSig)) + return propagateFloat32NaN(a, b, status); + + if ((bExp | bSig) == 0) { + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloat32(zSign, 0xFF, 0); + } + if (bExp == 0xFF) { + if (bSig) return propagateFloat32NaN(a, b, status); + if ((aExp | aSig) == 0) { + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat32(zSign, 0xFF, 0); + } + if (aExp == 0) { + if (aSig == 0) { + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloat32(zSign, 0, 0); + } + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if (bSig == 0) return packFloat32(zSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(bSig, &bExp, &bSig); + } + zExp = aExp + bExp - 0x7F; + aSig = (aSig | 0x00800000)<<7; + bSig = (bSig | 0x00800000)<<8; + zSig64 = shift64RightJamming(((uint64_t) aSig) * bSig, 32); + zSig = (uint32_t) zSig64; + if (0 <= (int32_t) (zSig<<1)) { + zSig <<= 1; + --zExp; + } + return roundAndPackFloat32(zSign, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the single-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_div(float32 a, float32 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int16_t aExp, bExp, zExp; + uint32_t aSig, bSig, zSig; + + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + aSign = extractFloat32Sign(a); + bSig = extractFloat32Frac(b); + bExp = extractFloat32Exp(b); + bSign = extractFloat32Sign(b); + zSign = aSign ^ bSign; + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + if (aExp == 0xFF) { + if (aSig) return propagateFloat32NaN(a, b, status); + if (bExp == 0xFF) { + if (bSig) return propagateFloat32NaN(a, b, status); + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloat32(zSign, 0xFF, 0); + } + if (bExp == 0xFF) { + if (bSig) return propagateFloat32NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat32(zSign, 0, 0); + } + if (bExp == 0) { + if (bSig == 0) { + if ((aExp | aSig) == 0) { + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + float_raise(status, float_flag_divbyzero); + return packFloat32(zSign, 0xFF, 0); + } + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(bSig, &bExp, &bSig); + } + if (aExp == 0) { + if (aSig == 0) return packFloat32(zSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(aSig, &aExp, &aSig); + } + zExp = aExp - bExp + 0x7D; + aSig = (aSig | 0x00800000)<<7; + bSig = (bSig | 0x00800000)<<8; + if (bSig <= (aSig + aSig)) { + aSig >>= 1; + ++zExp; + } + zSig = (((uint64_t) aSig)<<32) / bSig; + if ((zSig & 0x3F) == 0) { + zSig |= ((uint64_t) bSig * zSig != ((uint64_t) aSig)<<32); + } + return roundAndPackFloat32(zSign, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the single-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sqrt(float32 a, float_status_t &status) +{ + int aSign; + int16_t aExp, zExp; + uint32_t aSig, zSig; + uint64_t rem, term; + + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + aSign = extractFloat32Sign(a); + + if (aExp == 0xFF) { + if (aSig) return propagateFloat32NaN(a, status); + if (! aSign) return a; + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + } + + if (aSign) { + if ((aExp | aSig) == 0) return packFloat32(aSign, 0, 0); + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + if (aExp == 0) { + if (aSig == 0) return 0; + float_raise(status, float_flag_denormal); + normalizeFloat32Subnormal(aSig, &aExp, &aSig); + } + zExp = ((aExp - 0x7F)>>1) + 0x7E; + aSig = (aSig | 0x00800000)<<8; + zSig = estimateSqrt32(aExp, aSig) + 2; + if ((zSig & 0x7F) <= 5) { + if (zSig < 2) { + zSig = 0x7FFFFFFF; + goto roundAndPack; + } + aSig >>= aExp & 1; + term = ((uint64_t) zSig) * zSig; + rem = (((uint64_t) aSig)<<32) - term; + while ((int64_t) rem < 0) { + --zSig; + rem += (((uint64_t) zSig)<<1) | 1; + } + zSig |= (rem != 0); + } + zSig = shift32RightJamming(zSig, 1); + roundAndPack: + return roundAndPackFloat32(0, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Determine single-precision floating-point number class +*----------------------------------------------------------------------------*/ + +float_class_t float32_class(float32 a) +{ + int16_t aExp = extractFloat32Exp(a); + uint32_t aSig = extractFloat32Frac(a); + int aSign = extractFloat32Sign(a); + + if(aExp == 0xFF) { + if (aSig == 0) + return (aSign) ? float_negative_inf : float_positive_inf; + + return float_NaN; + } + + if(aExp == 0) { + if (aSig == 0) return float_zero; + return float_denormal; + } + + return float_normalized; +} + +/*---------------------------------------------------------------------------- +| Compare between two single precision floating point numbers. Returns +| 'float_relation_equal' if the operands are equal, 'float_relation_less' if +| the value 'a' is less than the corresponding value `b', +| 'float_relation_greater' if the value 'a' is greater than the corresponding +| value `b', or 'float_relation_unordered' otherwise. +*----------------------------------------------------------------------------*/ + +int float32_compare(float32 a, float32 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float32_denormal_to_zero(a); + b = float32_denormal_to_zero(b); + } + + float_class_t aClass = float32_class(a); + float_class_t bClass = float32_class(b); + + if (aClass == float_NaN || bClass == float_NaN) { + float_raise(status, float_flag_invalid); + return float_relation_unordered; + } + + if (aClass == float_denormal || bClass == float_denormal) + { + float_raise(status, float_flag_denormal); + } + + if ((a == b) || ((uint32_t) ((a | b)<<1) == 0)) return float_relation_equal; + + int aSign = extractFloat32Sign(a); + int bSign = extractFloat32Sign(b); + if (aSign != bSign) + return (aSign) ? float_relation_less : float_relation_greater; + + if (aSign ^ (a < b)) return float_relation_less; + return float_relation_greater; +} + +/*---------------------------------------------------------------------------- +| Compare between two double precision floating point numbers. Returns +| 'float_relation_equal' if the operands are equal, 'float_relation_less' if +| the value 'a' is less than the corresponding value `b', +| 'float_relation_greater' if the value 'a' is greater than the corresponding +| value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause +| an exception. +*----------------------------------------------------------------------------*/ + +int float32_compare_quiet(float32 a, float32 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float32_denormal_to_zero(a); + b = float32_denormal_to_zero(b); + } + + float_class_t aClass = float32_class(a); + float_class_t bClass = float32_class(b); + + if (aClass == float_NaN || bClass == float_NaN) + { + if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b)) + { + float_raise(status, float_flag_invalid); + } + return float_relation_unordered; + } + + if (aClass == float_denormal || bClass == float_denormal) + { + float_raise(status, float_flag_denormal); + } + + if ((a == b) || ((uint32_t) ((a | b)<<1) == 0)) return float_relation_equal; + + int aSign = extractFloat32Sign(a); + int bSign = extractFloat32Sign(b); + if (aSign != bSign) + return (aSign) ? float_relation_less : float_relation_greater; + + if (aSign ^ (a < b)) return float_relation_less; + return float_relation_greater; +} + +/*---------------------------------------------------------------------------- +| Compare bewteen two single precision floating point numbers and return the +| smaller of them. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_min(float32 a, float32 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float32_denormal_to_zero(a); + b = float32_denormal_to_zero(b); + } + + return (float32_compare(a, b, status) == float_relation_less) ? a : b; +} + +/*---------------------------------------------------------------------------- +| Compare bewteen two single precision floating point numbers and return the +| larger of them. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_max(float32 a, float32 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float32_denormal_to_zero(a); + b = float32_denormal_to_zero(b); + } + + return (float32_compare(a, b, status) == float_relation_greater) ? a : b; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic - which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN or the +| conversion overflows, the integer indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int32_t float64_to_int32(float64 a, float_status_t &status) +{ + uint64_t aSig = extractFloat64Frac(a); + int16_t aExp = extractFloat64Exp(a); + int aSign = extractFloat64Sign(a); + if ((aExp == 0x7FF) && aSig) aSign = 0; + if (aExp) aSig |= UINT64_C(0x0010000000000000); + else { + if (get_denormals_are_zeros(status)) aSig = 0; + } + int shiftCount = 0x42C - aExp; + if (0 < shiftCount) aSig = shift64RightJamming(aSig, shiftCount); + return roundAndPackInt32(aSign, aSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN or the conversion overflows, the integer indefinite +| value is returned. +*----------------------------------------------------------------------------*/ + +int32_t float64_to_int32_round_to_zero(float64 a, float_status_t &status) +{ + int aSign; + int16_t aExp; + uint64_t aSig, savedASig; + int32_t z; + int shiftCount; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + if (0x41E < aExp) { + if ((aExp == 0x7FF) && aSig) aSign = 0; + goto invalid; + } + else if (aExp < 0x3FF) { + if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0; + if (aExp || aSig) float_raise(status, float_flag_inexact); + return 0; + } + aSig |= UINT64_C(0x0010000000000000); + shiftCount = 0x433 - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = (int32_t) aSig; + if (aSign) z = -z; + if ((z < 0) ^ aSign) { + invalid: + float_raise(status, float_flag_invalid); + return (int32_t)(int32_indefinite); + } + if ((aSig<>(-shiftCount); + if ((uint64_t) (aSig<<(shiftCount & 63))) { + float_raise(status, float_flag_inexact); + } + } + if (aSign) z = -z; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the single-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float64_to_float32(float64 a, float_status_t &status) +{ + int aSign; + int16_t aExp; + uint64_t aSig; + uint32_t zSig; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + if (aExp == 0x7FF) { + if (aSig) return commonNaNToFloat32(float64ToCommonNaN(a, status)); + return packFloat32(aSign, 0xFF, 0); + } + if (aExp == 0) { + if (get_denormals_are_zeros(status)) aSig = 0; + if (aSig == 0) return packFloat32(aSign, 0, 0); + float_raise(status, float_flag_denormal); + } + aSig = shift64RightJamming(aSig, 22); + zSig = (uint32_t) aSig; + if (aExp || zSig) { + zSig |= 0x40000000; + aExp -= 0x381; + } + return roundAndPackFloat32(aSign, aExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Rounds the double-precision floating-point value `a' to an integer, and +| returns the result as a double-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_round_to_int(float64 a, float_status_t &status) +{ + int16_t aExp; + uint64_t lastBitMask, roundBitsMask; + int roundingMode = get_float_rounding_mode(status); + float64 z; + + aExp = extractFloat64Exp(a); + if (0x433 <= aExp) { + if ((aExp == 0x7FF) && extractFloat64Frac(a)) { + return propagateFloat64NaN(a, status); + } + return a; + } + + if (get_denormals_are_zeros(status)) { + a = float64_denormal_to_zero(a); + } + + if (aExp < 0x3FF) { + if ((uint64_t) (a<<1) == 0) return a; + float_raise(status, float_flag_inexact); + int aSign = extractFloat64Sign(a); + switch (roundingMode) { + case float_round_nearest_even: + if ((aExp == 0x3FE) && extractFloat64Frac(a)) { + return packFloat64(aSign, 0x3FF, 0); + } + break; + case float_round_down: + return aSign ? UINT64_C(0xBFF0000000000000) : 0; + case float_round_up: + return + aSign ? UINT64_C(0x8000000000000000) : UINT64_C(0x3FF0000000000000); + } + return packFloat64(aSign, 0, 0); + } + lastBitMask = 1; + lastBitMask <<= 0x433 - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + if (roundingMode == float_round_nearest_even) { + z += lastBitMask>>1; + if ((z & roundBitsMask) == 0) z &= ~lastBitMask; + } + else if (roundingMode != float_round_to_zero) { + if (extractFloat64Sign(z) ^ (roundingMode == float_round_up)) { + z += roundBitsMask; + } + } + z &= ~roundBitsMask; + if (z != a) float_raise(status, float_flag_inexact); + return z; +} + +/*---------------------------------------------------------------------------- +| Extracts the fractional portion of double-precision floating-point value `a', +| and returns the result as a double-precision floating-point value. The +| fractional results are precise. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_frc(float64 a, float_status_t &status) +{ + int roundingMode = get_float_rounding_mode(status); + + uint64_t aSig = extractFloat64Frac(a); + int16_t aExp = extractFloat64Exp(a); + int aSign = extractFloat64Sign(a); + + if (aExp == 0x7FF) { + if (aSig) return propagateFloat64NaN(a, status); + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + + if (aExp >= 0x433) { + return packFloat64(roundingMode == float_round_down, 0, 0); + } + + if (aExp < 0x3FF) { + if (aExp == 0) { + if (get_denormals_are_zeros(status)) aSig = 0; + if (aSig == 0) { + return packFloat64(roundingMode == float_round_down, 0, 0); + } + + float_raise(status, float_flag_denormal); + if (! float_exception_masked(status, float_flag_underflow)) + float_raise(status, float_flag_underflow); + + if(get_flush_underflow_to_zero(status)) { + float_raise(status, float_flag_underflow | float_flag_inexact); + return packFloat64(aSign, 0, 0); + } + } + return a; + } + + uint64_t lastBitMask = UINT64_C(1) << (0x433 - aExp); + uint64_t roundBitsMask = lastBitMask - 1; + + aSig &= roundBitsMask; + aSig <<= 10; + aExp--; + + if (aSig == 0) + return packFloat64(roundingMode == float_round_down, 0, 0); + + return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the double-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 addFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status) +{ + int16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig; + int16_t expDiff; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + bSig = extractFloat64Frac(b); + bExp = extractFloat64Exp(b); + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + expDiff = aExp - bExp; + aSig <<= 9; + bSig <<= 9; + if (0 < expDiff) { + if (aExp == 0x7FF) { + if (aSig) return propagateFloat64NaN(a, b, status); + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return a; + } + if ((aExp == 0) && aSig) + float_raise(status, float_flag_denormal); + + if (bExp == 0) { + if (bSig) float_raise(status, float_flag_denormal); + --expDiff; + } + else bSig |= UINT64_C(0x2000000000000000); + + bSig = shift64RightJamming(bSig, expDiff); + zExp = aExp; + } + else if (expDiff < 0) { + if (bExp == 0x7FF) { + if (bSig) return propagateFloat64NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat64(zSign, 0x7FF, 0); + } + if ((bExp == 0) && bSig) + float_raise(status, float_flag_denormal); + + if (aExp == 0) { + if (aSig) float_raise(status, float_flag_denormal); + ++expDiff; + } + else aSig |= UINT64_C(0x2000000000000000); + + aSig = shift64RightJamming(aSig, -expDiff); + zExp = bExp; + } + else { + if (aExp == 0x7FF) { + if (aSig | bSig) return propagateFloat64NaN(a, b, status); + return a; + } + if (aExp == 0) { + zSig = (aSig + bSig) >> 9; + if (aSig | bSig) { + float_raise(status, float_flag_denormal); + if (get_flush_underflow_to_zero(status)) { + float_raise(status, float_flag_underflow | float_flag_inexact); + return packFloat64(zSign, 0, 0); + } + if (! float_exception_masked(status, float_flag_underflow)) { + if (extractFloat64Frac(zSig) == zSig) + float_raise(status, float_flag_underflow); + } + } + return packFloat64(zSign, 0, zSig); + } + zSig = UINT64_C(0x4000000000000000) + aSig + bSig; + return roundAndPackFloat64(zSign, aExp, zSig, status); + } + aSig |= UINT64_C(0x2000000000000000); + zSig = (aSig + bSig)<<1; + --zExp; + if ((int64_t) zSig < 0) { + zSig = aSig + bSig; + ++zExp; + } + return roundAndPackFloat64(zSign, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 subFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status) +{ + int16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig; + int16_t expDiff; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + bSig = extractFloat64Frac(b); + bExp = extractFloat64Exp(b); + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + expDiff = aExp - bExp; + aSig <<= 10; + bSig <<= 10; + if (0 < expDiff) goto aExpBigger; + if (expDiff < 0) goto bExpBigger; + if (aExp == 0x7FF) { + if (aSig | bSig) return propagateFloat64NaN(a, b, status); + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + if (aExp == 0) { + if (aSig | bSig) float_raise(status, float_flag_denormal); + aExp = 1; + bExp = 1; + } + if (bSig < aSig) goto aBigger; + if (aSig < bSig) goto bBigger; + return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0); + bExpBigger: + if (bExp == 0x7FF) { + if (bSig) return propagateFloat64NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat64(zSign ^ 1, 0x7FF, 0); + } + if ((bExp == 0) && bSig) + float_raise(status, float_flag_denormal); + + if (aExp == 0) { + if (aSig) float_raise(status, float_flag_denormal); + ++expDiff; + } + else aSig |= UINT64_C(0x4000000000000000); + + aSig = shift64RightJamming(aSig, -expDiff); + bSig |= UINT64_C(0x4000000000000000); + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if (aExp == 0x7FF) { + if (aSig) return propagateFloat64NaN(a, b, status); + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return a; + } + if ((aExp == 0) && aSig) + float_raise(status, float_flag_denormal); + + if (bExp == 0) { + if (bSig) float_raise(status, float_flag_denormal); + --expDiff; + } + else bSig |= UINT64_C(0x4000000000000000); + + bSig = shift64RightJamming(bSig, expDiff); + aSig |= UINT64_C(0x4000000000000000); + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the double-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_add(float64 a, float64 b, float_status_t &status) +{ + int aSign = extractFloat64Sign(a); + int bSign = extractFloat64Sign(b); + + if (aSign == bSign) { + return addFloat64Sigs(a, b, aSign, status); + } + else { + return subFloat64Sigs(a, b, aSign, status); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sub(float64 a, float64 b, float_status_t &status) +{ + int aSign = extractFloat64Sign(a); + int bSign = extractFloat64Sign(b); + + if (aSign == bSign) { + return subFloat64Sigs(a, b, aSign, status); + } + else { + return addFloat64Sigs(a, b, aSign, status); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_mul(float64 a, float64 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + bSig = extractFloat64Frac(b); + bExp = extractFloat64Exp(b); + bSign = extractFloat64Sign(b); + zSign = aSign ^ bSign; + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + if (aExp == 0x7FF) { + if (aSig || ((bExp == 0x7FF) && bSig)) { + return propagateFloat64NaN(a, b, status); + } + if ((bExp | bSig) == 0) { + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloat64(zSign, 0x7FF, 0); + } + if (bExp == 0x7FF) { + if (bSig) return propagateFloat64NaN(a, b, status); + if ((aExp | aSig) == 0) { + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat64(zSign, 0x7FF, 0); + } + if (aExp == 0) { + if (aSig == 0) { + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloat64(zSign, 0, 0); + } + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if (bSig == 0) return packFloat64(zSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(bSig, &bExp, &bSig); + } + zExp = aExp + bExp - 0x3FF; + aSig = (aSig | UINT64_C(0x0010000000000000))<<10; + bSig = (bSig | UINT64_C(0x0010000000000000))<<11; + mul64To128(aSig, bSig, &zSig0, &zSig1); + zSig0 |= (zSig1 != 0); + if (0 <= (int64_t) (zSig0<<1)) { + zSig0 <<= 1; + --zExp; + } + return roundAndPackFloat64(zSign, zExp, zSig0, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the double-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_div(float64 a, float64 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int16_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig; + uint64_t rem0, rem1; + uint64_t term0, term1; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + bSig = extractFloat64Frac(b); + bExp = extractFloat64Exp(b); + bSign = extractFloat64Sign(b); + zSign = aSign ^ bSign; + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + if (bExp == 0) bSig = 0; + } + + if (aExp == 0x7FF) { + if (aSig) return propagateFloat64NaN(a, b, status); + if (bExp == 0x7FF) { + if (bSig) return propagateFloat64NaN(a, b, status); + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloat64(zSign, 0x7FF, 0); + } + if (bExp == 0x7FF) { + if (bSig) return propagateFloat64NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloat64(zSign, 0, 0); + } + if (bExp == 0) { + if (bSig == 0) { + if ((aExp | aSig) == 0) { + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + float_raise(status, float_flag_divbyzero); + return packFloat64(zSign, 0x7FF, 0); + } + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(bSig, &bExp, &bSig); + } + if (aExp == 0) { + if (aSig == 0) return packFloat64(zSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(aSig, &aExp, &aSig); + } + zExp = aExp - bExp + 0x3FD; + aSig = (aSig | UINT64_C(0x0010000000000000))<<10; + bSig = (bSig | UINT64_C(0x0010000000000000))<<11; + if (bSig <= (aSig + aSig)) { + aSig >>= 1; + ++zExp; + } + zSig = estimateDiv128To64(aSig, 0, bSig); + if ((zSig & 0x1FF) <= 2) { + mul64To128(bSig, zSig, &term0, &term1); + sub128(aSig, 0, term0, term1, &rem0, &rem1); + while ((int64_t) rem0 < 0) { + --zSig; + add128(rem0, rem1, 0, bSig, &rem0, &rem1); + } + zSig |= (rem1 != 0); + } + return roundAndPackFloat64(zSign, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the double-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sqrt(float64 a, float_status_t &status) +{ + int aSign; + int16_t aExp, zExp; + uint64_t aSig, zSig, doubleZSig; + uint64_t rem0, rem1, term0, term1; + + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + + if (aExp == 0x7FF) { + if (aSig) return propagateFloat64NaN(a, status); + if (! aSign) return a; + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + + if (get_denormals_are_zeros(status)) { + if (aExp == 0) aSig = 0; + } + + if (aSign) { + if ((aExp | aSig) == 0) return packFloat64(aSign, 0, 0); + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + if (aExp == 0) { + if (aSig == 0) return 0; + float_raise(status, float_flag_denormal); + normalizeFloat64Subnormal(aSig, &aExp, &aSig); + } + zExp = ((aExp - 0x3FF)>>1) + 0x3FE; + aSig |= UINT64_C(0x0010000000000000); + zSig = estimateSqrt32(aExp, (uint32_t)(aSig>>21)); + aSig <<= 9 - (aExp & 1); + zSig = estimateDiv128To64(aSig, 0, zSig<<32) + (zSig<<30); + if ((zSig & 0x1FF) <= 5) { + doubleZSig = zSig<<1; + mul64To128(zSig, zSig, &term0, &term1); + sub128(aSig, 0, term0, term1, &rem0, &rem1); + while ((int64_t) rem0 < 0) { + --zSig; + doubleZSig -= 2; + add128(rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1); + } + zSig |= ((rem0 | rem1) != 0); + } + return roundAndPackFloat64(0, zExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Determine double-precision floating-point number class +*----------------------------------------------------------------------------*/ + +float_class_t float64_class(float64 a) +{ + int16_t aExp = extractFloat64Exp(a); + uint64_t aSig = extractFloat64Frac(a); + int aSign = extractFloat64Sign(a); + + if(aExp == 0x7FF) { + if (aSig == 0) + return (aSign) ? float_negative_inf : float_positive_inf; + + return float_NaN; + } + + if(aExp == 0) { + if (aSig == 0) + return float_zero; + return float_denormal; + } + + return float_normalized; +} + +/*---------------------------------------------------------------------------- +| Compare between two double precision floating point numbers. Returns +| 'float_relation_equal' if the operands are equal, 'float_relation_less' if +| the value 'a' is less than the corresponding value `b', +| 'float_relation_greater' if the value 'a' is greater than the corresponding +| value `b', or 'float_relation_unordered' otherwise. +*----------------------------------------------------------------------------*/ + +int float64_compare(float64 a, float64 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float64_denormal_to_zero(a); + b = float64_denormal_to_zero(b); + } + + float_class_t aClass = float64_class(a); + float_class_t bClass = float64_class(b); + + if (aClass == float_NaN || bClass == float_NaN) { + float_raise(status, float_flag_invalid); + return float_relation_unordered; + } + + if (aClass == float_denormal || bClass == float_denormal) + { + float_raise(status, float_flag_denormal); + } + + if ((a == b) || ((uint64_t) ((a | b)<<1) == 0)) return float_relation_equal; + + int aSign = extractFloat64Sign(a); + int bSign = extractFloat64Sign(b); + if (aSign != bSign) + return (aSign) ? float_relation_less : float_relation_greater; + + if (aSign ^ (a < b)) return float_relation_less; + return float_relation_greater; +} + +/*---------------------------------------------------------------------------- +| Compare between two double precision floating point numbers. Returns +| 'float_relation_equal' if the operands are equal, 'float_relation_less' if +| the value 'a' is less than the corresponding value `b', +| 'float_relation_greater' if the value 'a' is greater than the corresponding +| value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause +| an exception. +*----------------------------------------------------------------------------*/ + +int float64_compare_quiet(float64 a, float64 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float64_denormal_to_zero(a); + b = float64_denormal_to_zero(b); + } + + float_class_t aClass = float64_class(a); + float_class_t bClass = float64_class(b); + + if (aClass == float_NaN || bClass == float_NaN) + { + if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b)) + { + float_raise(status, float_flag_invalid); + } + return float_relation_unordered; + } + + if (aClass == float_denormal || bClass == float_denormal) + { + float_raise(status, float_flag_denormal); + } + + if ((a == b) || ((uint64_t) ((a | b)<<1) == 0)) return float_relation_equal; + + int aSign = extractFloat64Sign(a); + int bSign = extractFloat64Sign(b); + if (aSign != bSign) + return (aSign) ? float_relation_less : float_relation_greater; + + if (aSign ^ (a < b)) return float_relation_less; + return float_relation_greater; +} + +/*---------------------------------------------------------------------------- +| Compare bewteen two double precision floating point numbers and return the +| smaller of them. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_min(float64 a, float64 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float64_denormal_to_zero(a); + b = float64_denormal_to_zero(b); + } + + return (float64_compare(a, b, status) == float_relation_less) ? a : b; +} + +/*---------------------------------------------------------------------------- +| Compare bewteen two double precision floating point numbers and return the +| larger of them. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_max(float64 a, float64 b, float_status_t &status) +{ + if (get_denormals_are_zeros(status)) { + a = float64_denormal_to_zero(a); + b = float64_denormal_to_zero(b); + } + + return (float64_compare(a, b, status) == float_relation_greater) ? a : b; +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 32-bit two's complement integer `a' +| to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 int32_to_floatx80(int32_t a) +{ + if (a == 0) return packFloatx80(0, 0, 0); + int zSign = (a < 0); + uint32_t absA = zSign ? -a : a; + int shiftCount = countLeadingZeros32(absA) + 32; + uint64_t zSig = absA; + return packFloatx80(zSign, 0x403E - shiftCount, zSig< 0x401E) goto invalid; + if (aExp < 0x3FFF) { + if (aExp || aSig) float_raise(status, float_flag_inexact); + return 0; + } + shiftCount = 0x403E - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = (int32_t) aSig; + if (aSign) z = -z; + if ((z < 0) ^ aSign) { + invalid: + float_raise(status, float_flag_invalid); + return (int32_t)(int32_indefinite); + } + if ((aSig<>(-shiftCount); + if ((uint64_t) (aSig<<(shiftCount & 63))) { + float_raise(status, float_flag_inexact); + } + if (aSign) z = -z; + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the single-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 floatx80_to_float32(floatx80 a, float_status_t &status) +{ + uint64_t aSig = extractFloatx80Frac(a); + int32_t aExp = extractFloatx80Exp(a); + int aSign = extractFloatx80Sign(a); + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a)) + { + float_raise(status, float_flag_invalid); + return float32_default_nan; + } + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1)) + return commonNaNToFloat32(floatx80ToCommonNaN(a, status)); + + return packFloat32(aSign, 0xFF, 0); + } + aSig = shift64RightJamming(aSig, 33); + if (aExp || aSig) aExp -= 0x3F81; + return roundAndPackFloat32(aSign, aExp, (uint32_t) aSig, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 floatx80_to_float64(floatx80 a, float_status_t &status) +{ + int32_t aExp; + uint64_t aSig, zSig; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a)) + { + float_raise(status, float_flag_invalid); + return float64_default_nan; + } + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + int aSign = extractFloatx80Sign(a); + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1)) { + return commonNaNToFloat64(floatx80ToCommonNaN(a, status)); + } + return packFloat64(aSign, 0x7FF, 0); + } + zSig = shift64RightJamming(aSig, 1); + if (aExp || aSig) aExp -= 0x3C01; + return roundAndPackFloat64(aSign, aExp, zSig, status); +} + +/*---------------------------------------------------------------------------- +| Rounds the extended double-precision floating-point value `a' to an integer, +| and returns the result as an extended double-precision floating-point +| value. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_round_to_int(floatx80 a, float_status_t &status) +{ + int aSign; + uint64_t lastBitMask, roundBitsMask; + int roundingMode = get_float_rounding_mode(status); + floatx80 z; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a)) + { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + int32_t aExp = extractFloatx80Exp(a); + uint64_t aSig = extractFloatx80Frac(a); + if (0x403E <= aExp) { + if ((aExp == 0x7FFF) && (uint64_t) (aSig<<1)) { + return propagateFloatx80NaN(a, status); + } + return a; + } + if (aExp < 0x3FFF) { + if (aExp == 0) { + if ((aSig<<1) == 0) return a; + float_raise(status, float_flag_denormal); + } + float_raise(status, float_flag_inexact); + aSign = extractFloatx80Sign(a); + switch (roundingMode) { + case float_round_nearest_even: + if ((aExp == 0x3FFE) && (uint64_t) (aSig<<1)) { + set_float_rounding_up(status); + return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); + } + break; + case float_round_down: + if (aSign) { + set_float_rounding_up(status); + return packFloatx80(1, 0x3FFF, UINT64_C(0x8000000000000000)); + } + else { + return packFloatx80(0, 0, 0); + } + case float_round_up: + if (aSign) { + return packFloatx80(1, 0, 0); + } + else { + set_float_rounding_up(status); + return packFloatx80(0, 0x3FFF, UINT64_C(0x8000000000000000)); + } + } + return packFloatx80(aSign, 0, 0); + } + lastBitMask = 1; + lastBitMask <<= 0x403E - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + if (roundingMode == float_round_nearest_even) { + z.fraction += lastBitMask>>1; + if ((z.fraction & roundBitsMask) == 0) z.fraction &= ~lastBitMask; + } + else if (roundingMode != float_round_to_zero) { + if (extractFloatx80Sign(z) ^ (roundingMode == float_round_up)) + z.fraction += roundBitsMask; + } + z.fraction &= ~roundBitsMask; + if (z.fraction == 0) { + z.exp++; + z.fraction = UINT64_C(0x8000000000000000); + } + if (z.fraction != a.fraction) { + float_raise(status, float_flag_inexact); + if (z.fraction > a.fraction || z.exp > a.exp) + set_float_rounding_up(status); + } + return z; +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the extended double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is +| negated before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status) +{ + int32_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) + { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1) || ((bExp == 0x7FFF) && (uint64_t) (bSig<<1))) + return propagateFloatx80NaN(a, b, status); + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return a; + } + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (aExp == 0) { + if (aSig == 0) { + if ((bExp == 0) && bSig) { + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + return roundAndPackFloatx80(get_float_rounding_precision(status), + zSign, bExp, bSig, 0, status); + } + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if (bSig == 0) + return roundAndPackFloatx80(get_float_rounding_precision(status), + zSign, aExp, aSig, 0, status); + + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + int32_t expDiff = aExp - bExp; + zExp = aExp; + if (0 < expDiff) { + shift64ExtraRightJamming(bSig, 0, expDiff, &bSig, &zSig1); + } + else if (expDiff < 0) { + shift64ExtraRightJamming(aSig, 0, -expDiff, &aSig, &zSig1); + zExp = bExp; + } + else { + zSig0 = aSig + bSig; + zSig1 = 0; + goto shiftRight1; + } + zSig0 = aSig + bSig; + if ((int64_t) zSig0 < 0) goto roundAndPack; + shiftRight1: + shift64ExtraRightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); + zSig0 |= UINT64_C(0x8000000000000000); + zExp++; + roundAndPack: + return + roundAndPackFloatx80(get_float_rounding_precision(status), + zSign, zExp, zSig0, zSig1, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the extended +| double-precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status) +{ + int32_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) + { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1)) return propagateFloatx80NaN(a, b, status); + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b, status); + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return a; + } + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloatx80(zSign ^ 1, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (aExp == 0) { + if (aSig == 0) { + if (bExp == 0) { + if (bSig) { + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + return roundAndPackFloatx80(get_float_rounding_precision(status), + zSign ^ 1, bExp, bSig, 0, status); + } + return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0); + } + return roundAndPackFloatx80(get_float_rounding_precision(status), + zSign ^ 1, bExp, bSig, 0, status); + } + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if (bSig == 0) + return roundAndPackFloatx80(get_float_rounding_precision(status), + zSign, aExp, aSig, 0, status); + + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + int32_t expDiff = aExp - bExp; + if (0 < expDiff) { + shift128RightJamming(bSig, 0, expDiff, &bSig, &zSig1); + goto aBigger; + } + if (expDiff < 0) { + shift128RightJamming(aSig, 0, -expDiff, &aSig, &zSig1); + goto bBigger; + } + zSig1 = 0; + if (bSig < aSig) goto aBigger; + if (aSig < bSig) goto bBigger; + return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0); + bBigger: + sub128(bSig, 0, aSig, zSig1, &zSig0, &zSig1); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aBigger: + sub128(aSig, 0, bSig, zSig1, &zSig0, &zSig1); + zExp = aExp; + normalizeRoundAndPack: + return + normalizeRoundAndPackFloatx80(get_float_rounding_precision(status), + zSign, zExp, zSig0, zSig1, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the extended double-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_add(floatx80 a, floatx80 b, float_status_t &status) +{ + int aSign = extractFloatx80Sign(a); + int bSign = extractFloatx80Sign(b); + + if (aSign == bSign) + return addFloatx80Sigs(a, b, aSign, status); + else + return subFloatx80Sigs(a, b, aSign, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status_t &status) +{ + int aSign = extractFloatx80Sign(a); + int bSign = extractFloatx80Sign(b); + + if (aSign == bSign) + return subFloatx80Sigs(a, b, aSign, status); + else + return addFloatx80Sigs(a, b, aSign, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int32_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) + { + invalid: + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + aSign = extractFloatx80Sign(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + bSign = extractFloatx80Sign(b); + zSign = aSign ^ bSign; + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1) || ((bExp == 0x7FFF) && (uint64_t) (bSig<<1))) { + return propagateFloatx80NaN(a, b, status); + } + if (bExp == 0) { + if (bSig == 0) goto invalid; + float_raise(status, float_flag_denormal); + } + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b, status); + if (aExp == 0) { + if (aSig == 0) goto invalid; + float_raise(status, float_flag_denormal); + } + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (aExp == 0) { + if (aSig == 0) { + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloatx80(zSign, 0, 0); + } + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if (bSig == 0) return packFloatx80(zSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + zExp = aExp + bExp - 0x3FFE; + mul64To128(aSig, bSig, &zSig0, &zSig1); + if (0 < (int64_t) zSig0) { + shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1); + --zExp; + } + return + roundAndPackFloatx80(get_float_rounding_precision(status), + zSign, zExp, zSig0, zSig1, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the extended double-precision floating-point +| value `a' by the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_div(floatx80 a, floatx80 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int32_t aExp, bExp, zExp; + uint64_t aSig, bSig, zSig0, zSig1; + uint64_t rem0, rem1, rem2, term0, term1, term2; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) + { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + aSign = extractFloatx80Sign(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + bSign = extractFloatx80Sign(b); + + zSign = aSign ^ bSign; + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1)) return propagateFloatx80NaN(a, b, status); + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b, status); + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b, status); + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + return packFloatx80(zSign, 0, 0); + } + if (bExp == 0) { + if (bSig == 0) { + if ((aExp | aSig) == 0) { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + float_raise(status, float_flag_divbyzero); + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + if (aExp == 0) { + if (aSig == 0) return packFloatx80(zSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + zExp = aExp - bExp + 0x3FFE; + rem1 = 0; + if (bSig <= aSig) { + shift128Right(aSig, 0, 1, &aSig, &rem1); + ++zExp; + } + zSig0 = estimateDiv128To64(aSig, rem1, bSig); + mul64To128(bSig, zSig0, &term0, &term1); + sub128(aSig, rem1, term0, term1, &rem0, &rem1); + while ((int64_t) rem0 < 0) { + --zSig0; + add128(rem0, rem1, 0, bSig, &rem0, &rem1); + } + zSig1 = estimateDiv128To64(rem1, 0, bSig); + if ((uint64_t) (zSig1<<1) <= 8) { + mul64To128(bSig, zSig1, &term1, &term2); + sub128(rem1, 0, term1, term2, &rem1, &rem2); + while ((int64_t) rem1 < 0) { + --zSig1; + add128(rem1, rem2, 0, bSig, &rem1, &rem2); + } + zSig1 |= ((rem1 | rem2) != 0); + } + return + roundAndPackFloatx80(get_float_rounding_precision(status), + zSign, zExp, zSig0, zSig1, status); +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the extended double-precision floating-point +| value `a'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sqrt(floatx80 a, float_status_t &status) +{ + int aSign; + int32_t aExp, zExp; + uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; + uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a)) + { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + aSig0 = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + aSign = extractFloatx80Sign(a); + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig0<<1)) return propagateFloatx80NaN(a, status); + if (! aSign) return a; + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + if (aSign) { + if ((aExp | aSig0) == 0) return a; + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + if (aExp == 0) { + if (aSig0 == 0) return packFloatx80(0, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0); + } + zExp = ((aExp - 0x3FFF)>>1) + 0x3FFF; + zSig0 = estimateSqrt32(aExp, aSig0>>32); + shift128Right(aSig0, 0, 2 + (aExp & 1), &aSig0, &aSig1); + zSig0 = estimateDiv128To64(aSig0, aSig1, zSig0<<32) + (zSig0<<30); + doubleZSig0 = zSig0<<1; + mul64To128(zSig0, zSig0, &term0, &term1); + sub128(aSig0, aSig1, term0, term1, &rem0, &rem1); + while ((int64_t) rem0 < 0) { + --zSig0; + doubleZSig0 -= 2; + add128(rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1); + } + zSig1 = estimateDiv128To64(rem1, 0, doubleZSig0); + if ((zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF)) <= 5) { + if (zSig1 == 0) zSig1 = 1; + mul64To128(doubleZSig0, zSig1, &term1, &term2); + sub128(rem1, 0, term1, term2, &rem1, &rem2); + mul64To128(zSig1, zSig1, &term2, &term3); + sub192(rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3); + while ((int64_t) rem1 < 0) { + --zSig1; + shortShift128Left(0, zSig1, 1, &term2, &term3); + term3 |= 1; + term2 |= doubleZSig0; + add192(rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3); + } + zSig1 |= ((rem1 | rem2 | rem3) != 0); + } + shortShift128Left(0, zSig1, 1, &zSig0, &zSig1); + zSig0 |= doubleZSig0; + return + roundAndPackFloatx80(get_float_rounding_precision(status), + 0, zExp, zSig0, zSig1, status); +} + +#endif + +#ifdef FLOAT128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the quadruple-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 floatx80_to_float128(floatx80 a, float_status_t &status) +{ + uint64_t zSig0, zSig1; + + uint64_t aSig = extractFloatx80Frac(a); + int32_t aExp = extractFloatx80Exp(a); + int aSign = extractFloatx80Sign(a); + + if ((aExp == 0x7FFF) && (uint64_t) (aSig<<1)) + return commonNaNToFloat128(floatx80ToCommonNaN(a, status)); + + shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1); + return packFloat128(aSign, aExp, zSig0, zSig1); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the extended double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float128_to_floatx80(float128 a, float_status_t &status) +{ + int32_t aExp; + uint64_t aSig0, aSig1; + + aSig1 = extractFloat128Frac1(a); + aSig0 = extractFloat128Frac0(a); + aExp = extractFloat128Exp(a); + int aSign = extractFloat128Sign(a); + + if (aExp == 0x7FFF) { + if (aSig0 | aSig1) + return commonNaNToFloatx80(float128ToCommonNaN(a, status)); + + return packFloatx80(aSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + + if (aExp == 0) { + if ((aSig0 | aSig1) == 0) return packFloatx80(aSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1); + } + else aSig0 |= UINT64_C(0x0001000000000000); + + shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1); + return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the extended double-precision floating- +| point value `a' and quadruple-precision floating point value `b'. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status) +{ + int32_t aExp, bExp, zExp; + uint64_t aSig, bSig0, bSig1, zSig0, zSig1, zSig2; + int aSign, bSign, zSign; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a)) + { + invalid: + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + aSign = extractFloatx80Sign(a); + bSig0 = extractFloat128Frac0(b); + bSig1 = extractFloat128Frac1(b); + bExp = extractFloat128Exp(b); + bSign = extractFloat128Sign(b); + + zSign = aSign ^ bSign; + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1) + || ((bExp == 0x7FFF) && (bSig0 | bSig1))) + { + floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status)); + return propagateFloatx80NaN(a, r, status); + } + if (bExp == 0) { + if ((bSig0 | bSig1) == 0) goto invalid; + float_raise(status, float_flag_denormal); + } + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (bExp == 0x7FFF) { + if (bSig0 | bSig1) { + floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status)); + return propagateFloatx80NaN(a, r, status); + } + if (aExp == 0) { + if (aSig == 0) goto invalid; + float_raise(status, float_flag_denormal); + } + return packFloatx80(zSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (aExp == 0) { + if (aSig == 0) { + if ((bExp == 0) && (bSig0 | bSig1)) float_raise(status, float_flag_denormal); + return packFloatx80(zSign, 0, 0); + } + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1); + } + else bSig0 |= UINT64_C(0x0001000000000000); + + zExp = aExp + bExp - 0x3FFE; + shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1); + mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2); + if (0 < (int64_t) zSig0) { + shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1); + --zExp; + } + return + roundAndPackFloatx80(get_float_rounding_precision(status), + zSign, zExp, zSig0, zSig1, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the quadruple-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 addFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status) +{ + int32_t aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + int32_t expDiff; + + aSig1 = extractFloat128Frac1(a); + aSig0 = extractFloat128Frac0(a); + aExp = extractFloat128Exp(a); + bSig1 = extractFloat128Frac1(b); + bSig0 = extractFloat128Frac0(b); + bExp = extractFloat128Exp(b); + expDiff = aExp - bExp; + + if (0 < expDiff) { + if (aExp == 0x7FFF) { + if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status); + return a; + } + if (bExp == 0) --expDiff; + else bSig0 |= UINT64_C(0x0001000000000000); + shift128ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2); + zExp = aExp; + } + else if (expDiff < 0) { + if (bExp == 0x7FFF) { + if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status); + return packFloat128(zSign, 0x7FFF, 0, 0); + } + if (aExp == 0) ++expDiff; + else aSig0 |= UINT64_C(0x0001000000000000); + shift128ExtraRightJamming(aSig0, aSig1, 0, -expDiff, &aSig0, &aSig1, &zSig2); + zExp = bExp; + } + else { + if (aExp == 0x7FFF) { + if (aSig0 | aSig1 | bSig0 | bSig1) + return propagateFloat128NaN(a, b, status); + + return a; + } + add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1); + if (aExp == 0) return packFloat128(zSign, 0, zSig0, zSig1); + zSig2 = 0; + zSig0 |= UINT64_C(0x0002000000000000); + zExp = aExp; + goto shiftRight1; + } + aSig0 |= UINT64_C(0x0001000000000000); + add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1); + --zExp; + if (zSig0 < UINT64_C(0x0002000000000000)) goto roundAndPack; + ++zExp; + shiftRight1: + shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2); + roundAndPack: + return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the quadruple- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 subFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status) +{ + int32_t aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; + int32_t expDiff; + + aSig1 = extractFloat128Frac1(a); + aSig0 = extractFloat128Frac0(a); + aExp = extractFloat128Exp(a); + bSig1 = extractFloat128Frac1(b); + bSig0 = extractFloat128Frac0(b); + bExp = extractFloat128Exp(b); + + expDiff = aExp - bExp; + shortShift128Left(aSig0, aSig1, 14, &aSig0, &aSig1); + shortShift128Left(bSig0, bSig1, 14, &bSig0, &bSig1); + if (0 < expDiff) goto aExpBigger; + if (expDiff < 0) goto bExpBigger; + if (aExp == 0x7FFF) { + if (aSig0 | aSig1 | bSig0 | bSig1) + return propagateFloat128NaN(a, b, status); + + float_raise(status, float_flag_invalid); + return float128_default_nan; + } + if (aExp == 0) { + aExp = 1; + bExp = 1; + } + if (bSig0 < aSig0) goto aBigger; + if (aSig0 < bSig0) goto bBigger; + if (bSig1 < aSig1) goto aBigger; + if (aSig1 < bSig1) goto bBigger; + return packFloat128(0, 0); + + bExpBigger: + if (bExp == 0x7FFF) { + if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status); + return packFloat128(zSign ^ 1, 0x7FFF, 0, 0); + } + if (aExp == 0) ++expDiff; + else { + aSig0 |= UINT64_C(0x4000000000000000); + } + shift128RightJamming(aSig0, aSig1, - expDiff, &aSig0, &aSig1); + bSig0 |= UINT64_C(0x4000000000000000); + bBigger: + sub128(bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if (aExp == 0x7FFF) { + if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status); + return a; + } + if (bExp == 0) --expDiff; + else { + bSig0 |= UINT64_C(0x4000000000000000); + } + shift128RightJamming(bSig0, bSig1, expDiff, &bSig0, &bSig1); + aSig0 |= UINT64_C(0x4000000000000000); + aBigger: + sub128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1); + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the quadruple-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_add(float128 a, float128 b, float_status_t &status) +{ + int aSign = extractFloat128Sign(a); + int bSign = extractFloat128Sign(b); + + if (aSign == bSign) { + return addFloat128Sigs(a, b, aSign, status); + } + else { + return subFloat128Sigs(a, b, aSign, status); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_sub(float128 a, float128 b, float_status_t &status) +{ + int aSign = extractFloat128Sign(a); + int bSign = extractFloat128Sign(b); + + if (aSign == bSign) { + return subFloat128Sigs(a, b, aSign, status); + } + else { + return addFloat128Sigs(a, b, aSign, status); + } +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_mul(float128 a, float128 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int32_t aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; + + aSig1 = extractFloat128Frac1(a); + aSig0 = extractFloat128Frac0(a); + aExp = extractFloat128Exp(a); + aSign = extractFloat128Sign(a); + bSig1 = extractFloat128Frac1(b); + bSig0 = extractFloat128Frac0(b); + bExp = extractFloat128Exp(b); + bSign = extractFloat128Sign(b); + + zSign = aSign ^ bSign; + if (aExp == 0x7FFF) { + if ((aSig0 | aSig1) || ((bExp == 0x7FFF) && (bSig0 | bSig1))) { + return propagateFloat128NaN(a, b, status); + } + if ((bExp | bSig0 | bSig1) == 0) { + float_raise(status, float_flag_invalid); + return float128_default_nan; + } + return packFloat128(zSign, 0x7FFF, 0, 0); + } + if (bExp == 0x7FFF) { + if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status); + if ((aExp | aSig0 | aSig1) == 0) { + float_raise(status, float_flag_invalid); + return float128_default_nan; + } + return packFloat128(zSign, 0x7FFF, 0, 0); + } + if (aExp == 0) { + if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1); + } + if (bExp == 0) { + if ((bSig0 | bSig1) == 0) return packFloat128(zSign, 0, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1); + } + zExp = aExp + bExp - 0x4000; + aSig0 |= UINT64_C(0x0001000000000000); + shortShift128Left(bSig0, bSig1, 16, &bSig0, &bSig1); + mul128To256(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3); + add128(zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1); + zSig2 |= (zSig3 != 0); + if (UINT64_C(0x0002000000000000) <= zSig0) { + shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2); + ++zExp; + } + return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the quadruple-precision floating-point value +| `a' by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_div(float128 a, float128 b, float_status_t &status) +{ + int aSign, bSign, zSign; + int32_t aExp, bExp, zExp; + uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; + + aSig1 = extractFloat128Frac1(a); + aSig0 = extractFloat128Frac0(a); + aExp = extractFloat128Exp(a); + aSign = extractFloat128Sign(a); + bSig1 = extractFloat128Frac1(b); + bSig0 = extractFloat128Frac0(b); + bExp = extractFloat128Exp(b); + bSign = extractFloat128Sign(b); + + zSign = aSign ^ bSign; + if (aExp == 0x7FFF) { + if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status); + if (bExp == 0x7FFF) { + if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status); + float_raise(status, float_flag_invalid); + return float128_default_nan; + } + return packFloat128(zSign, 0x7FFF, 0, 0); + } + if (bExp == 0x7FFF) { + if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status); + return packFloat128(zSign, 0, 0, 0); + } + if (bExp == 0) { + if ((bSig0 | bSig1) == 0) { + if ((aExp | aSig0 | aSig1) == 0) { + float_raise(status, float_flag_invalid); + return float128_default_nan; + } + float_raise(status, float_flag_divbyzero); + return packFloat128(zSign, 0x7FFF, 0, 0); + } + float_raise(status, float_flag_denormal); + normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1); + } + if (aExp == 0) { + if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1); + } + zExp = aExp - bExp + 0x3FFD; + shortShift128Left( + aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1); + shortShift128Left( + bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1); + if (le128(bSig0, bSig1, aSig0, aSig1)) { + shift128Right(aSig0, aSig1, 1, &aSig0, &aSig1); + ++zExp; + } + zSig0 = estimateDiv128To64(aSig0, aSig1, bSig0); + mul128By64To192(bSig0, bSig1, zSig0, &term0, &term1, &term2); + sub192(aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2); + while ((int64_t) rem0 < 0) { + --zSig0; + add192(rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2); + } + zSig1 = estimateDiv128To64(rem1, rem2, bSig0); + if ((zSig1 & 0x3FFF) <= 4) { + mul128By64To192(bSig0, bSig1, zSig1, &term1, &term2, &term3); + sub192(rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3); + while ((int64_t) rem1 < 0) { + --zSig1; + add192(rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3); + } + zSig1 |= ((rem1 | rem2 | rem3) != 0); + } + shift128ExtraRightJamming(zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2); + return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the 64-bit two's complement integer `a' to +| the quadruple-precision floating-point format. The conversion is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 int64_to_float128(int64_t a) +{ + uint64_t zSig0, zSig1; + + if (a == 0) return packFloat128(0, 0, 0, 0); + int zSign = (a < 0); + uint64_t absA = zSign ? - a : a; + uint8_t shiftCount = countLeadingZeros64(absA) + 49; + int32_t zExp = 0x406E - shiftCount; + if (64 <= shiftCount) { + zSig1 = 0; + zSig0 = absA; + shiftCount -= 64; + } + else { + zSig1 = absA; + zSig0 = 0; + } + shortShift128Left(zSig0, zSig1, shiftCount, &zSig0, &zSig1); + return packFloat128(zSign, zExp, zSig0, zSig1); +} + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat16.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat16.cc Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,133 @@ +/*============================================================================ +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Adapted for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#include "softfloat.hh" + +#ifdef FLOAT16 + +#include "softfloat-round-pack.hh" +#include "softfloat-specialize.hh" +#include "softfloat-macros.hh" + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Determine half-precision floating-point number class +*----------------------------------------------------------------------------*/ + +float_class_t float16_class(float16 a) +{ + int16_t aExp = extractFloat16Exp(a); + uint16_t aSig = extractFloat16Frac(a); + int aSign = extractFloat16Sign(a); + + if(aExp == 0x1F) { + if (aSig == 0) + return (aSign) ? float_negative_inf : float_positive_inf; + + return float_NaN; + } + + if(aExp == 0) { + if (aSig == 0) return float_zero; + return float_denormal; + } + + return float_normalized; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the half-precision floating-point value +| `a' to the single-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float16_to_float32(float16 a, float_status_t &status) +{ + uint16_t aSig = extractFloat16Frac(a); + int16_t aExp = extractFloat16Exp(a); + int aSign = extractFloat16Sign(a); + + if (aExp == 0x1F) { + if (aSig) return commonNaNToFloat32(float16ToCommonNaN(a, status)); + return packFloat32(aSign, 0xFF, 0); + } + if (aExp == 0) { + // ignore denormals_are_zeros flag + if (aSig == 0) return packFloat32(aSign, 0, 0); + float_raise(status, float_flag_denormal); + normalizeFloat16Subnormal(aSig, &aExp, &aSig); + --aExp; + } + + return packFloat32(aSign, aExp + 0x70, ((uint32_t) aSig)<<13); +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the half-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float16 float32_to_float16(float32 a, float_status_t &status) +{ + uint32_t aSig = extractFloat32Frac(a); + int16_t aExp = extractFloat32Exp(a); + int aSign = extractFloat32Sign(a); + + if (aExp == 0xFF) { + if (aSig) return commonNaNToFloat16(float32ToCommonNaN(a, status)); + return packFloat16(aSign, 0x1F, 0); + } + if (aExp == 0) { + if (get_denormals_are_zeros(status)) aSig = 0; + if (aSig == 0) return packFloat16(aSign, 0, 0); + float_raise(status, float_flag_denormal); + } + + aSig = shift32RightJamming(aSig, 9); + uint16_t zSig = (uint16_t) aSig; + if (aExp || zSig) { + zSig |= 0x4000; + aExp -= 0x71; + } + + return roundAndPackFloat16(aSign, aExp, zSig, status); +} + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloat_ns.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloat_ns.hh Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 Andreas Sandberg + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Andreas Sandberg + */ + +#ifndef _SOFTFLOAT_NS_HH_ +#define _SOFTFLOAT_NS_HH_ + +#define USING_SOFTFLOAT_NS using namespace SoftFloat +#define BEGIN_SOFTFLOAT_NS namespace SoftFloat { +#define END_SOFTFLOAT_NS } + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloatx80.hh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloatx80.hh Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,104 @@ +/*============================================================================ +This source file is an extension to the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator) +floating point emulation. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Written for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#ifndef _SOFTFLOATX80_EXTENSIONS_H_ +#define _SOFTFLOATX80_EXTENSIONS_H_ + +#include "softfloat.hh" +#include "softfloat-specialize.hh" + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ + +int16_t floatx80_to_int16(floatx80, float_status_t &status); +int16_t floatx80_to_int16_round_to_zero(floatx80, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. +*----------------------------------------------------------------------------*/ + +float_class_t floatx80_class(floatx80); +floatx80 floatx80_extract(floatx80 &a, float_status_t &status); +floatx80 floatx80_scale(floatx80 a, floatx80 b, float_status_t &status); +int floatx80_remainder(floatx80 a, floatx80 b, floatx80 &r, uint64_t &q, float_status_t &status); +int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 &r, uint64_t &q, float_status_t &status); +floatx80 f2xm1(floatx80 a, float_status_t &status); +floatx80 fyl2x(floatx80 a, floatx80 b, float_status_t &status); +floatx80 fyl2xp1(floatx80 a, floatx80 b, float_status_t &status); +floatx80 fpatan(floatx80 a, floatx80 b, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision trigonometric functions. +*----------------------------------------------------------------------------*/ + +int fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a, float_status_t &status); +int fsin(floatx80 &a, float_status_t &status); +int fcos(floatx80 &a, float_status_t &status); +int ftan(floatx80 &a, float_status_t &status); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision compare. +*----------------------------------------------------------------------------*/ + +int floatx80_compare(floatx80, floatx80, float_status_t &status); +int floatx80_compare_quiet(floatx80, floatx80, float_status_t &status); + +/*----------------------------------------------------------------------------- +| Calculates the absolute value of the extended double-precision floating-point +| value `a'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +inline floatx80& floatx80_abs(floatx80 ®) +{ + reg.exp &= 0x7FFF; + return reg; +} + +/*----------------------------------------------------------------------------- +| Changes the sign of the extended double-precision floating-point value 'a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +inline floatx80& floatx80_chs(floatx80 ®) +{ + reg.exp ^= 0x8000; + return reg; +} + +/*----------------------------------------------------------------------------- +| Commonly used extended double-precision floating-point constants. +*----------------------------------------------------------------------------*/ + +extern const floatx80 Const_Z; +extern const floatx80 Const_1; + +END_SOFTFLOAT_NS + +#endif diff -r 372d3611c693 -r e89a7cf22da3 ext/softfloat/softfloatx80.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ext/softfloat/softfloatx80.cc Thu Sep 19 18:09:15 2013 +0200 @@ -0,0 +1,367 @@ +/*============================================================================ +This source file is an extension to the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator) +floating point emulation. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Written for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#include "softfloatx80.hh" +#include "softfloat-round-pack.hh" +#include "softfloat-macros.hh" + +BEGIN_SOFTFLOAT_NS + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 16-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic - which means in particular that the conversion +| is rounded according to the current rounding mode. If `a' is a NaN or the +| conversion overflows, the integer indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int16_t floatx80_to_int16(floatx80 a, float_status_t &status) +{ + if (floatx80_is_unsupported(a)) + { + float_raise(status, float_flag_invalid); + return int16_indefinite; + } + + int32_t v32 = floatx80_to_int32(a, status); + + if ((v32 > 32767) || (v32 < -32768)) { + status.float_exception_flags = float_flag_invalid; // throw way other flags + return int16_indefinite; + } + + return (int16_t) v32; +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 16-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic, except that the conversion is always rounded +| toward zero. If `a' is a NaN or the conversion overflows, the integer +| indefinite value is returned. +*----------------------------------------------------------------------------*/ + +int16_t floatx80_to_int16_round_to_zero(floatx80 a, float_status_t &status) +{ + if (floatx80_is_unsupported(a)) + { + float_raise(status, float_flag_invalid); + return int16_indefinite; + } + + int32_t v32 = floatx80_to_int32_round_to_zero(a, status); + + if ((v32 > 32767) || (v32 < -32768)) { + status.float_exception_flags = float_flag_invalid; // throw way other flags + return int16_indefinite; + } + + return (int16_t) v32; +} + +/*---------------------------------------------------------------------------- +| Separate the source extended double-precision floating point value `a' +| into its exponent and significand, store the significant back to the +| 'a' and return the exponent. The operation performed is a superset of +| the IEC/IEEE recommended logb(x) function. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_extract(floatx80 &a, float_status_t &status) +{ + uint64_t aSig = extractFloatx80Frac(a); + int32_t aExp = extractFloatx80Exp(a); + int aSign = extractFloatx80Sign(a); + + if (floatx80_is_unsupported(a)) + { + float_raise(status, float_flag_invalid); + a = floatx80_default_nan; + return a; + } + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1)) + { + a = propagateFloatx80NaN(a, status); + return a; + } + return packFloatx80(0, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (aExp == 0) + { + if (aSig == 0) { + float_raise(status, float_flag_divbyzero); + a = packFloatx80(aSign, 0, 0); + return packFloatx80(1, 0x7FFF, UINT64_C(0x8000000000000000)); + } + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + + a.exp = (aSign << 15) + 0x3FFF; + a.fraction = aSig; + return int32_to_floatx80(aExp - 0x3FFF); +} + +/*---------------------------------------------------------------------------- +| Scales extended double-precision floating-point value in operand `a' by +| value `b'. The function truncates the value in the second operand 'b' to +| an integral value and adds that value to the exponent of the operand 'a'. +| The operation performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_scale(floatx80 a, floatx80 b, float_status_t &status) +{ + int32_t aExp, bExp; + uint64_t aSig, bSig; + + // handle unsupported extended double-precision floating encodings + if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) + { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + int aSign = extractFloatx80Sign(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + int bSign = extractFloatx80Sign(b); + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1) || ((bExp == 0x7FFF) && (uint64_t) (bSig<<1))) + { + return propagateFloatx80NaN(a, b, status); + } + if ((bExp == 0x7FFF) && bSign) { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + return a; + } + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b, status); + if ((aExp | aSig) == 0) { + if (! bSign) { + float_raise(status, float_flag_invalid); + return floatx80_default_nan; + } + return a; + } + if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal); + if (bSign) return packFloatx80(aSign, 0, 0); + return packFloatx80(aSign, 0x7FFF, UINT64_C(0x8000000000000000)); + } + if (aExp == 0) { + if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal); + if (aSig == 0) return a; + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + if (bExp < 0x3FFF) + return normalizeRoundAndPackFloatx80(80, aSign, aExp, aSig, 0, status); + } + if (bExp == 0) { + if (bSig == 0) return a; + float_raise(status, float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + + if (bExp > 0x400E) { + /* generate appropriate overflow/underflow */ + return roundAndPackFloatx80(80, aSign, + bSign ? -0x3FFF : 0x7FFF, aSig, 0, status); + } + + if (bExp < 0x3FFF) return a; + + int shiftCount = 0x403E - bExp; + bSig >>= shiftCount; + int32_t scale = (int32_t) bSig; + if (bSign) scale = -scale; /* -32768..32767 */ + return + roundAndPackFloatx80(80, aSign, aExp+scale, aSig, 0, status); +} + +/*---------------------------------------------------------------------------- +| Determine extended-precision floating-point number class. +*----------------------------------------------------------------------------*/ + +float_class_t floatx80_class(floatx80 a) +{ + int32_t aExp = extractFloatx80Exp(a); + uint64_t aSig = extractFloatx80Frac(a); + + if(aExp == 0) { + if (aSig == 0) + return float_zero; + + /* denormal or pseudo-denormal */ + return float_denormal; + } + + /* valid numbers have the MS bit set */ + if (!(aSig & UINT64_C(0x8000000000000000))) + return float_NaN; /* report unsupported as NaNs */ + + if(aExp == 0x7fff) { + int aSign = extractFloatx80Sign(a); + + if (((uint64_t) (aSig<< 1)) == 0) + return (aSign) ? float_negative_inf : float_positive_inf; + + return float_NaN; + } + + return float_normalized; +} + +/*---------------------------------------------------------------------------- +| Compare between two extended precision floating point numbers. Returns +| 'float_relation_equal' if the operands are equal, 'float_relation_less' if +| the value 'a' is less than the corresponding value `b', +| 'float_relation_greater' if the value 'a' is greater than the corresponding +| value `b', or 'float_relation_unordered' otherwise. +*----------------------------------------------------------------------------*/ + +int floatx80_compare(floatx80 a, floatx80 b, float_status_t &status) +{ + float_class_t aClass = floatx80_class(a); + float_class_t bClass = floatx80_class(b); + + if (aClass == float_NaN || bClass == float_NaN) + { + float_raise(status, float_flag_invalid); + return float_relation_unordered; + } + + if (aClass == float_denormal || bClass == float_denormal) + { + float_raise(status, float_flag_denormal); + } + + int aSign = extractFloatx80Sign(a); + int bSign = extractFloatx80Sign(b); + + if (aClass == float_zero) { + if (bClass == float_zero) return float_relation_equal; + return bSign ? float_relation_greater : float_relation_less; + } + + if (bClass == float_zero || aSign != bSign) { + return aSign ? float_relation_less : float_relation_greater; + } + + uint64_t aSig = extractFloatx80Frac(a); + int32_t aExp = extractFloatx80Exp(a); + uint64_t bSig = extractFloatx80Frac(b); + int32_t bExp = extractFloatx80Exp(b); + + if (aClass == float_denormal) + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + + if (bClass == float_denormal) + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + + if (aExp == bExp && aSig == bSig) + return float_relation_equal; + + int less_than = + aSign ? ((bExp < aExp) || ((bExp == aExp) && (bSig < aSig))) + : ((aExp < bExp) || ((aExp == bExp) && (aSig < bSig))); + + if (less_than) return float_relation_less; + return float_relation_greater; +} + +/*---------------------------------------------------------------------------- +| Compare between two extended precision floating point numbers. Returns +| 'float_relation_equal' if the operands are equal, 'float_relation_less' if +| the value 'a' is less than the corresponding value `b', +| 'float_relation_greater' if the value 'a' is greater than the corresponding +| value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause +| an exception. +*----------------------------------------------------------------------------*/ + +int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status_t &status) +{ + float_class_t aClass = floatx80_class(a); + float_class_t bClass = floatx80_class(b); + + if (aClass == float_NaN || bClass == float_NaN) + { + if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) + float_raise(status, float_flag_invalid); + + if (floatx80_is_signaling_nan(a) || floatx80_is_signaling_nan(b)) + float_raise(status, float_flag_invalid); + + return float_relation_unordered; + } + + if (aClass == float_denormal || bClass == float_denormal) + { + float_raise(status, float_flag_denormal); + } + + int aSign = extractFloatx80Sign(a); + int bSign = extractFloatx80Sign(b); + + if (aClass == float_zero) { + if (bClass == float_zero) return float_relation_equal; + return bSign ? float_relation_greater : float_relation_less; + } + + if (bClass == float_zero || aSign != bSign) { + return aSign ? float_relation_less : float_relation_greater; + } + + uint64_t aSig = extractFloatx80Frac(a); + int32_t aExp = extractFloatx80Exp(a); + uint64_t bSig = extractFloatx80Frac(b); + int32_t bExp = extractFloatx80Exp(b); + + if (aClass == float_denormal) + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + + if (bClass == float_denormal) + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + + if (aExp == bExp && aSig == bSig) + return float_relation_equal; + + int less_than = + aSign ? ((bExp < aExp) || ((bExp == aExp) && (bSig < aSig))) + : ((aExp < bExp) || ((aExp == bExp) && (aSig < bSig))); + + if (less_than) return float_relation_less; + return float_relation_greater; +} + +END_SOFTFLOAT_NS