diff --git a/chunker/chunker.go b/chunker/chunker.go index d69579724..75ae1a1e0 100644 --- a/chunker/chunker.go +++ b/chunker/chunker.go @@ -10,10 +10,6 @@ const ( KiB = 1024 MiB = 1024 * KiB - // Polynomial is a randomly generated irreducible polynomial of degree 53 - // in Z_2[X]. All rabin fingerprints are calculated with this polynomial. - Polynomial = 0x3DA3358B4DC173 - // WindowSize is the size of the sliding window. WindowSize = 64 @@ -28,10 +24,17 @@ const ( ) var ( - pol_shift = deg(Polynomial) - 8 + // pol is a randomly generated irreducible polynomial of degree 53 + // in Z_2[X]. All rabin fingerprints are calculated with this polynomial. + pol = uint64(0x3DA3358B4DC173) + + pol_shift = deg(pol) - 8 once sync.Once mod_table [256]uint64 out_table [256]uint64 + + // tables have been filled, do not allow changing the polynomial afterwards + filled bool ) // A chunk is one content-dependent chunk of bytes whose end was cut when the @@ -69,6 +72,16 @@ type Chunker struct { h hash.Hash } +// SetPolynomial sets the polynomial that is to be used for calculating the rabin +// fingerprints. This function must be called before the first chunker is +// created, otherwise the results are undefined. +func SetPolynomial(f uint64) { + if filled { + panic("polynomial changed after chunker has already been used") + } + pol = f +} + // New returns a new Chunker that reads from data from rd with bufsize and pass // all data to hash along the way. func New(rd io.Reader, bufsize int, hash hash.Hash) *Chunker { @@ -109,6 +122,8 @@ func (c *Chunker) Reset(rd io.Reader) { // Calculate out_table and mod_table for optimization. Must be called only once. func fill_tables() { + filled = true + // calculate table for sliding out bytes. The byte to slide out is used as // the index for the table, the value contains the following: // out_table[b] = Hash(b || 0 || ... 
|| 0) @@ -123,15 +138,15 @@ func fill_tables() { for b := 0; b < 256; b++ { var hash uint64 - hash = append_byte(hash, byte(b), Polynomial) + hash = append_byte(hash, byte(b), pol) for i := 0; i < WindowSize-1; i++ { - hash = append_byte(hash, 0, Polynomial) + hash = append_byte(hash, 0, pol) } out_table[b] = hash } // calculate table for reduction mod Polynomial - k := deg(Polynomial) + k := deg(pol) for b := 0; b < 256; b++ { // mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and B = b(x) * x^k // @@ -140,7 +155,7 @@ func fill_tables() { // two parts: Part A contains the result of the modulus operation, part // B is used to cancel out the 8 top bits so that one XOR operation is // enough to reduce modulo Polynomial - mod_table[b] = mod(uint64(b)<= deg(p) { shift := uint(deg(x) - deg(p)) diff --git a/chunker/doc.go b/chunker/doc.go index 428ee29f2..5537c172c 100644 --- a/chunker/doc.go +++ b/chunker/doc.go @@ -6,6 +6,59 @@ Package chunker implements Content Defined Chunking (CDC) based on a rolling Rabin Checksum. +Choosing a Random Irreducible Polynomial + +The function RandomPolynomial() returns a new random polynomial of degree 53 +for use with the chunker. The degree 53 is chosen because it is the largest +prime below 64-8 = 56, so that the top 8 bits of a uint64 can be used for +optimising calculations in the chunker. + +A random polynomial is chosen by selecting 64 random bits, masking away bits +63..54 and setting bit 53 to one (otherwise the polynomial is not of the +desired degree) and bit 0 to one (otherwise the polynomial is trivially +reducible), so that 52 bits are chosen at random. + +This process is repeated until Irreducible() returns true, then this +polynomial is returned. If this doesn't happen after 1 million tries, the +function returns an error. 
The probability for selecting an irreducible +polynomial at random is about 3.8% ( (2^53-2)/53 / 2^52), so the probability +that no irreducible polynomial has been found after 100 tries is about +2.1%. + +Verifying Irreducible Polynomials + +During development the results have been verified using the computational +discrete algebra system GAP, which can be obtained from the website at +http://www.gap-system.org/. + +For filtering a given list of polynomials in hexadecimal coefficient notation, +the following script can be used: + + # create x over F_2 = GF(2) + x := Indeterminate(GF(2), "x"); + + # test if polynomial is irreducible, i.e. the number of factors is one + IrredPoly := function (poly) + return (Length(Factors(poly)) = 1); + end;; + + # create a polynomial in x from the hexadecimal representation of the + # coefficients + Hex2Poly := function (s) + return ValuePol(CoefficientsQadic(IntHexString(s), 2), x); + end;; + + # list of candidates, in hex + candidates := [ "3DA3358B4DC173" ]; + + # create real polynomials + L := List(candidates, Hex2Poly); + + # filter and display the list of irreducible polynomials contained in L + Display(Filtered(L, x -> (IrredPoly(x)))); + +All irreducible polynomials from the list are written to the output. + Background Literature An introduction to Rabin Fingerprints/Checksums can be found in the following articles: @@ -19,6 +72,9 @@ http://www.zlib.net/crc_v3.txt Andrei Z. 
Broder (1993): "Some Applications of Rabin's Fingerprinting Method" http://www.xmailserver.org/rabin_apps.pdf +Shuhong Gao and Daniel Panario (1997): "Tests and Constructions of Irreducible Polynomials over Finite Fields" +http://www.math.clemson.edu/~sgao/papers/GP97a.pdf + Andrew Kadatch, Bob Jenkins (2007): "Everything we know about CRC but afraid to forget" http://crcutil.googlecode.com/files/crc-doc.1.0.pdf diff --git a/chunker/generic_test.go b/chunker/generic_test.go new file mode 100644 index 000000000..88e799fc7 --- /dev/null +++ b/chunker/generic_test.go @@ -0,0 +1,36 @@ +package chunker_test + +import ( + "fmt" + "path/filepath" + "reflect" + "runtime" + "testing" +) + +// assert fails the test if the condition is false. +func assert(tb testing.TB, condition bool, msg string, v ...interface{}) { + if !condition { + _, file, line, _ := runtime.Caller(1) + fmt.Printf("\033[31m%s:%d: "+msg+"\033[39m\n\n", append([]interface{}{filepath.Base(file), line}, v...)...) + tb.FailNow() + } +} + +// ok fails the test if an err is not nil. +func ok(tb testing.TB, err error) { + if err != nil { + _, file, line, _ := runtime.Caller(1) + fmt.Printf("\033[31m%s:%d: unexpected error: %s\033[39m\n\n", filepath.Base(file), line, err.Error()) + tb.FailNow() + } +} + +// equals fails the test if exp is not equal to act. +func equals(tb testing.TB, exp, act interface{}) { + if !reflect.DeepEqual(exp, act) { + _, file, line, _ := runtime.Caller(1) + fmt.Printf("\033[31m%s:%d:\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", filepath.Base(file), line, exp, act) + tb.FailNow() + } +} diff --git a/chunker/polynomials.go b/chunker/polynomials.go new file mode 100644 index 000000000..4961f39b5 --- /dev/null +++ b/chunker/polynomials.go @@ -0,0 +1,257 @@ +package chunker + +import ( + "crypto/rand" + "encoding/binary" + "errors" + "fmt" + "strconv" +) + +// Pol is a polynomial from F_2[X]. +type Pol uint64 + +// Add returns x+y. 
+func (x Pol) Add(y Pol) Pol { + r := Pol(uint64(x) ^ uint64(y)) + return r +} + +// mulOverflows returns true if the multiplication would overflow uint64. +// Code by Rob Pike, see +// https://groups.google.com/d/msg/golang-nuts/h5oSN5t3Au4/KaNQREhZh0QJ +func mulOverflows(a, b Pol) bool { + if a <= 1 || b <= 1 { + return false + } + c := a.mul(b) + d := c.Div(b) + if d != a { + return true + } + + return false +} + +func (x Pol) mul(y Pol) Pol { + if x == 0 || y == 0 { + return 0 + } + + var res Pol + for i := 0; i <= y.Deg(); i++ { + if (y & (1 << uint(i))) > 0 { + res = res.Add(x << uint(i)) + } + } + + return res +} + +// Mul returns x*y. When an overflow occurs, Mul panics. +func (x Pol) Mul(y Pol) Pol { + if mulOverflows(x, y) { + panic("multiplication would overflow uint64") + } + + return x.mul(y) +} + +// Deg returns the degree of the polynomial x. If x is zero, -1 is returned. +func (x Pol) Deg() int { + // the degree of 0 is -1 + if x == 0 { + return -1 + } + + for i := 63; i >= 0; i-- { + // test if bit i is set + if x&(1< 0 { + // this is the degree of x + return i + } + } + + // fall-through, return -1 + return -1 +} + +// String returns the coefficients in hex. +func (x Pol) String() string { + return "0x" + strconv.FormatUint(uint64(x), 16) +} + +// Expand returns the string representation of the polynomial x. 
+func (x Pol) Expand() string { + if x == 0 { + return "0" + } + + s := "" + for i := x.Deg(); i > 1; i-- { + if x&(1< 0 { + s += fmt.Sprintf("+x^%d", i) + } + } + + if x&2 > 0 { + s += "+x" + } + + if x&1 > 0 { + s += "+1" + } + + return s[1:] +} + +// DivMod returns x / d = q, and remainder r, +// see https://en.wikipedia.org/wiki/Division_algorithm +func (x Pol) DivMod(d Pol) (Pol, Pol) { + if x == 0 { + return 0, 0 + } + + if d == 0 { + panic("division by zero") + } + + D := d.Deg() + diff := x.Deg() - D + if diff < 0 { + return 0, x + } + + var q Pol + for diff >= 0 { + m := d << uint(diff) + q |= (1 << uint(diff)) + x = x.Add(m) + + diff = x.Deg() - D + } + + return q, x +} + +// Div returns the integer division result x / d. +func (x Pol) Div(d Pol) Pol { + q, _ := x.DivMod(d) + return q +} + +// Mod returns the remainder of x / d +func (x Pol) Mod(d Pol) Pol { + _, r := x.DivMod(d) + return r +} + +// I really dislike having a function that does not terminate, so specify a +// really large upper bound for finding a new irreducible polynomial, and +// return an error when no irreducible polynomial has been found within +// randPolMaxTries. +const randPolMaxTries = 1e6 + +// RandomPolynomial returns a new random irreducible polynomial of degree 53 +// (largest prime number below 64-8). There are (2^53-2)/53 irreducible +// polynomials of degree 53 in F_2[X], cf. Michael O. Rabin (1981): +// "Fingerprinting by Random Polynomials", page 4. If no polynomial could be +// found in one million tries, an error is returned. 
+func RandomPolynomial() (Pol, error) { + for i := 0; i < randPolMaxTries; i++ { + var f Pol + + // choose polynomial at random + err := binary.Read(rand.Reader, binary.LittleEndian, &f) + if err != nil { + return 0, err + } + + // mask away bits above bit 53 + f &= Pol((1 << 54) - 1) + + // set highest and lowest bit so that the degree is 53 and the + // polynomial is not trivially reducible + f |= (1 << 53) | 1 + + // test if f is irreducible + if f.Irreducible() { + return f, nil + } + } + + // If this is reached, we haven't found an irreducible polynomial in + // randPolMaxTries. This error is very unlikely to occur. + return 0, errors.New("unable to find new random irreducible polynomial") +} + +// GCD computes the Greatest Common Divisor of x and f. +func (x Pol) GCD(f Pol) Pol { + if f == 0 { + return x + } + + if x == 0 { + return f + } + + if x.Deg() < f.Deg() { + x, f = f, x + } + + return f.GCD(x.Mod(f)) +} + +// Irreducible returns true iff x is irreducible over F_2. This function +// uses Ben-Or's irreducibility test. +// +// For details see "Tests and Constructions of Irreducible Polynomials over +// Finite Fields". +func (x Pol) Irreducible() bool { + for i := 1; i <= x.Deg()/2; i++ { + if x.GCD(qp(uint(i), x)) != 1 { + return false + } + } + + return true +} + +// MulMod computes x*f mod g +func (x Pol) MulMod(f, g Pol) Pol { + if x == 0 || f == 0 { + return 0 + } + + var res Pol + for i := 0; i <= f.Deg(); i++ { + if (f & (1 << uint(i))) > 0 { + a := x + for j := 0; j < i; j++ { + a = a.Mul(2).Mod(g) + } + res = res.Add(a).Mod(g) + } + } + + return res +} + +// qp computes the polynomial (x^(2^p)-x) mod g. This is needed for the +// reducibility test. 
+func qp(p uint, g Pol) Pol { + num := (1 << p) + i := 1 + + // start with x + res := Pol(2) + + for i < num { + // repeatedly square res + res = res.MulMod(res, g) + i *= 2 + } + + // add x + return res.Add(2).Mod(g) +} diff --git a/chunker/polynomials_test.go b/chunker/polynomials_test.go new file mode 100644 index 000000000..411e299e3 --- /dev/null +++ b/chunker/polynomials_test.go @@ -0,0 +1,350 @@ +package chunker_test + +import ( + "strconv" + "testing" + + "github.com/restic/restic/chunker" +) + +var polAddTests = []struct { + x, y chunker.Pol + sum chunker.Pol +}{ + {23, 16, 23 ^ 16}, + {0x9a7e30d1e855e0a0, 0x670102a1f4bcd414, 0xfd7f32701ce934b4}, + {0x9a7e30d1e855e0a0, 0x9a7e30d1e855e0a0, 0}, +} + +func TestPolAdd(t *testing.T) { + for _, test := range polAddTests { + equals(t, test.sum, test.x.Add(test.y)) + equals(t, test.sum, test.y.Add(test.x)) + } +} + +func parseBin(s string) chunker.Pol { + i, err := strconv.ParseUint(s, 2, 64) + if err != nil { + panic(err) + } + + return chunker.Pol(i) +} + +var polMulTests = []struct { + x, y chunker.Pol + res chunker.Pol +}{ + {1, 2, 2}, + { + parseBin("1101"), + parseBin("10"), + parseBin("11010"), + }, + { + parseBin("1101"), + parseBin("11"), + parseBin("10111"), + }, + { + 0x40000000, + 0x40000000, + 0x1000000000000000, + }, + { + parseBin("1010"), + parseBin("100100"), + parseBin("101101000"), + }, + { + parseBin("100"), + parseBin("11"), + parseBin("1100"), + }, + { + parseBin("11"), + parseBin("110101"), + parseBin("1011111"), + }, + { + parseBin("10011"), + parseBin("110101"), + parseBin("1100001111"), + }, +} + +func TestPolMul(t *testing.T) { + for i, test := range polMulTests { + m := test.x.Mul(test.y) + assert(t, test.res == m, + "TestPolMul failed for test %d: %v * %v: want %v, got %v", + i, test.x, test.y, test.res, m) + m = test.y.Mul(test.x) + assert(t, test.res == test.y.Mul(test.x), + "TestPolMul failed for %d: %v * %v: want %v, got %v", + i, test.x, test.y, test.res, m) + } +} + +func 
TestPolMulOverflow(t *testing.T) { + defer func() { + // try to recover overflow error + err := recover() + + if e, ok := err.(string); ok && e == "multiplication would overflow uint64" { + return + } else { + t.Logf("invalid error raised: %v", err) + // re-raise error if not overflow + panic(err) + } + }() + + x := chunker.Pol(1 << 63) + x.Mul(2) + t.Fatal("overflow test did not panic") +} + +var polDivTests = []struct { + x, y chunker.Pol + res chunker.Pol +}{ + {10, 50, 0}, + {0, 1, 0}, + { + parseBin("101101000"), // 0x168 + parseBin("1010"), // 0xa + parseBin("100100"), // 0x24 + }, + {2, 2, 1}, + { + 0x8000000000000000, + 0x8000000000000000, + 1, + }, + { + parseBin("1100"), + parseBin("100"), + parseBin("11"), + }, + { + parseBin("1100001111"), + parseBin("10011"), + parseBin("110101"), + }, +} + +func TestPolDiv(t *testing.T) { + for i, test := range polDivTests { + m := test.x.Div(test.y) + assert(t, test.res == m, + "TestPolDiv failed for test %d: %v * %v: want %v, got %v", + i, test.x, test.y, test.res, m) + } +} + +var polModTests = []struct { + x, y chunker.Pol + res chunker.Pol +}{ + {10, 50, 10}, + {0, 1, 0}, + { + parseBin("101101001"), + parseBin("1010"), + parseBin("1"), + }, + {2, 2, 0}, + { + 0x8000000000000000, + 0x8000000000000000, + 0, + }, + { + parseBin("1100"), + parseBin("100"), + parseBin("0"), + }, + { + parseBin("1100001111"), + parseBin("10011"), + parseBin("0"), + }, +} + +func TestPolModt(t *testing.T) { + for _, test := range polModTests { + equals(t, test.res, test.x.Mod(test.y)) + } +} + +func BenchmarkPolDivMod(t *testing.B) { + f := chunker.Pol(0x2482734cacca49) + g := chunker.Pol(0x3af4b284899) + + for i := 0; i < t.N; i++ { + g.DivMod(f) + } +} + +func BenchmarkPolDeg(t *testing.B) { + f := chunker.Pol(0x3af4b284899) + d := f.Deg() + if d != 41 { + t.Fatalf("BenchmalPolDeg: Wrong degree %d returned, expected %d", + d, 41) + } + + for i := 0; i < t.N; i++ { + f.Deg() + } +} + +func TestRandomPolynomial(t *testing.T) { + _, err 
:= chunker.RandomPolynomial() + ok(t, err) +} + +func BenchmarkRandomPolynomial(t *testing.B) { + for i := 0; i < t.N; i++ { + _, err := chunker.RandomPolynomial() + ok(t, err) + } +} + +func TestExpandPolynomial(t *testing.T) { + pol := chunker.Pol(0x3DA3358B4DC173) + s := pol.Expand() + equals(t, "x^53+x^52+x^51+x^50+x^48+x^47+x^45+x^41+x^40+x^37+x^36+x^34+x^32+x^31+x^27+x^25+x^24+x^22+x^19+x^18+x^16+x^15+x^14+x^8+x^6+x^5+x^4+x+1", s) +} + +var polIrredTests = []struct { + f chunker.Pol + irred bool +}{ + {0x38f1e565e288df, false}, + {0x3DA3358B4DC173, true}, + {0x30a8295b9d5c91, false}, + {0x255f4350b962cb, false}, + {0x267f776110a235, false}, + {0x2f4dae10d41227, false}, + {0x2482734cacca49, true}, + {0x312daf4b284899, false}, + {0x29dfb6553d01d1, false}, + {0x3548245eb26257, false}, + {0x3199e7ef4211b3, false}, + {0x362f39017dae8b, false}, + {0x200d57aa6fdacb, false}, + {0x35e0a4efa1d275, false}, + {0x2ced55b026577f, false}, + {0x260b012010893d, false}, + {0x2df29cbcd59e9d, false}, + {0x3f2ac7488bd429, false}, + {0x3e5cb1711669fb, false}, + {0x226d8de57a9959, false}, + {0x3c8de80aaf5835, false}, + {0x2026a59efb219b, false}, + {0x39dfa4d13fb231, false}, + {0x3143d0464b3299, false}, +} + +func TestPolIrreducible(t *testing.T) { + for _, test := range polIrredTests { + assert(t, test.f.Irreducible() == test.irred, + "Irreducibility test for Polynomial %v failed: got %v, wanted %v", + test.f, test.f.Irreducible(), test.irred) + } +} + +var polGCDTests = []struct { + f1 chunker.Pol + f2 chunker.Pol + gcd chunker.Pol +}{ + {10, 50, 2}, + {0, 1, 1}, + { + parseBin("101101001"), + parseBin("1010"), + parseBin("1"), + }, + {2, 2, 2}, + { + parseBin("1010"), + parseBin("11"), + parseBin("11"), + }, + { + 0x8000000000000000, + 0x8000000000000000, + 0x8000000000000000, + }, + { + parseBin("1100"), + parseBin("101"), + parseBin("11"), + }, + { + parseBin("1100001111"), + parseBin("10011"), + parseBin("10011"), + }, + { + 0x3DA3358B4DC173, + 0x3DA3358B4DC173, + 
0x3DA3358B4DC173, + }, + { + 0x3DA3358B4DC173, + 0x230d2259defd, + 1, + }, + { + 0x230d2259defd, + 0x51b492b3eff2, + parseBin("10011"), + }, +} + +func TestPolGCD(t *testing.T) { + for i, test := range polGCDTests { + gcd := test.f1.GCD(test.f2) + assert(t, test.gcd == gcd, + "GCD test %d (%+v) failed: got %v, wanted %v", + i, test, gcd, test.gcd) + gcd = test.f2.GCD(test.f1) + assert(t, test.gcd == gcd, + "GCD test %d (%+v) failed: got %v, wanted %v", + i, test, gcd, test.gcd) + } +} + +var polMulModTests = []struct { + f1 chunker.Pol + f2 chunker.Pol + g chunker.Pol + mod chunker.Pol +}{ + { + 0x1230, + 0x230, + 0x55, + 0x22, + }, + { + 0x0eae8c07dbbb3026, + 0xd5d6db9de04771de, + 0xdd2bda3b77c9, + 0x425ae8595b7a, + }, +} + +func TestPolMulMod(t *testing.T) { + for i, test := range polMulModTests { + mod := test.f1.MulMod(test.f2, test.g) + assert(t, mod == test.mod, + "MulMod test %d (%+v) failed: got %v, wanted %v", + i, test, mod, test.mod) + } +} diff --git a/doc/test_irreducibility.gap b/doc/test_irreducibility.gap new file mode 100644 index 000000000..22b2ae3a4 --- /dev/null +++ b/doc/test_irreducibility.gap @@ -0,0 +1,25 @@ +# This file is a script for GAP and tests a list of polynomials in hexadecimal +# for irreducibility over F_2 + +# create x over F_2 = GF(2) +x := Indeterminate(GF(2), "x"); + +# test if polynomial is irreducible, i.e. the number of factors is one +IrredPoly := function (poly) + return (Length(Factors(poly)) = 1); +end;; + +# create a polynomial in x from the hexadecimal representation of the +# coefficients +Hex2Poly := function (s) + return ValuePol(CoefficientsQadic(IntHexString(s), 2), x); +end;; + +# list of candidates, in hex +candidates := [ "3DA3358B4DC173" ]; + +# create real polynomials +L := List(candidates, Hex2Poly); + +# filter and display the list of irreducible polynomials contained in L +Display(Filtered(L, x -> (IrredPoly(x))));