From 78a00ae3cefe7239bf33c202543e0b945d2877cd Mon Sep 17 00:00:00 2001
From: Mirek Kratochvil <exa.exa@gmail.com>
Date: Sun, 15 Nov 2015 17:11:49 +0100
Subject: [PATCH] mce_qcmdpc: implement faster decoding

This replaces the periodic recalculation of the error correlations and the
syndrome with in-place modification. A bit flip is therefore a bit slower, but
overall decoding of the 256-bit secure variant fits in 200ms, and the 128-bit
variant decodes in under 20ms.

There could still be some (blatantly nondeterministic) method to do this using
FFT; research is underway.
---
 src/algos_enc.cpp  | 29 ++++++++-------
 src/bvector.h      |  4 +++
 src/mce_qcmdpc.cpp | 90 +++++++++++++++++++++++++++++++---------------
 3 files changed, 83 insertions(+), 40 deletions(-)
diff --git a/src/algos_enc.cpp b/src/algos_enc.cpp
index 20d8fd6..602f9bc 100644
--- a/src/algos_enc.cpp
+++ b/src/algos_enc.cpp
@@ -351,6 +351,11 @@ static int fo_decrypt (const bvector&cipher, bvector&plain,
 	//decrypt the symmetric key
 	volatile bool failed = Priv.decrypt (mce_cipher, mce_plain, ev);
 
+	if (failed) { //prevent memory errors
+		ev.resize (ciphersize, 0);
+		mce_plain.resize (plainsize, 0);
+	}
+
 	/*
 	 * if decoding failed, ev contains something weird. We need to make it
 	 * to contain some dummy (but still valid) error vector that would work
@@ -419,21 +424,21 @@ int algo_mceqcmdpc##name::create_keypair (sencode**pub, sencode**priv, prng&rng)
 
 #if HAVE_CRYPTOPP==1
 
-mceqcmdpc_create_keypair_func (128, 9857, 2, 71, 134, 60, 5)
-mceqcmdpc_create_keypair_func (256, 32771, 2, 137, 264, 60, 8)
-mceqcmdpc_create_keypair_func (128cha, 9857, 2, 71, 134, 60, 5)
-mceqcmdpc_create_keypair_func (256cha, 32771, 2, 137, 264, 60, 8)
-mceqcmdpc_create_keypair_func (128xs, 9857, 2, 71, 134, 60, 5)
-mceqcmdpc_create_keypair_func (256xs, 32771, 2, 137, 264, 60, 8)
+mceqcmdpc_create_keypair_func (128, 9857, 2, 71, 134, 25, 4)
+mceqcmdpc_create_keypair_func (256, 32771, 2, 137, 264, 40, 4)
+mceqcmdpc_create_keypair_func (128cha, 9857, 2, 71, 134, 25, 4)
+mceqcmdpc_create_keypair_func (256cha, 32771, 2, 137, 264, 40, 4)
+mceqcmdpc_create_keypair_func (128xs, 9857, 2, 71, 134, 25, 4)
+mceqcmdpc_create_keypair_func (256xs, 32771, 2, 137, 264, 40, 4)
 
 #endif //HAVE_CRYPTOPP==1
 
-mceqcmdpc_create_keypair_func (128cube, 9857, 2, 71, 134, 60, 5)
-mceqcmdpc_create_keypair_func (256cube, 32771, 2, 137, 264, 60, 8)
-mceqcmdpc_create_keypair_func (128cubecha, 9857, 2, 71, 134, 60, 5)
-mceqcmdpc_create_keypair_func (256cubecha, 32771, 2, 137, 264, 60, 8)
-mceqcmdpc_create_keypair_func (128cubexs, 9857, 2, 71, 134, 60, 5)
-mceqcmdpc_create_keypair_func (256cubexs, 32771, 2, 137, 264, 60, 8)
+mceqcmdpc_create_keypair_func (128cube, 9857, 2, 71, 134, 25, 4)
+mceqcmdpc_create_keypair_func (256cube, 32771, 2, 137, 264, 40, 4)
+mceqcmdpc_create_keypair_func (128cubecha, 9857, 2, 71, 134, 25, 4)
+mceqcmdpc_create_keypair_func (256cubecha, 32771, 2, 137, 264, 40, 4)
+mceqcmdpc_create_keypair_func (128cubexs, 9857, 2, 71, 134, 25, 4)
+mceqcmdpc_create_keypair_func (256cubexs, 32771, 2, 137, 264, 40, 4)
 
 #define mceqcmdpc_create_encdec_func(name,bs,bc,errcount,hash_type,pad_hash_type,scipher,ranksize) \
 int algo_mceqcmdpc##name::encrypt (const bvector&plain, bvector&cipher, \
diff --git a/src/bvector.h b/src/bvector.h
index 4c50e05..bbc2f7c 100644
--- a/src/bvector.h
+++ b/src/bvector.h
@@ -165,6 +165,10 @@ public:
 		_data[blockof (i)] &= ~ ( ( (uint64_t) 1) << blockpos (i));
 	}
 
+	inline void flip (size_t i) { //toggle bit i in place
+		_data[blockof (i)] ^= ( (uint64_t) 1) << blockpos (i);
+	}
+
 	inline const_reference operator[] (size_t pos) const {
 		return const_reference (*this, pos);
 	}
diff --git a/src/mce_qcmdpc.cpp b/src/mce_qcmdpc.cpp
index 7473f56..8e41233 100644
--- a/src/mce_qcmdpc.cpp
+++ b/src/mce_qcmdpc.cpp
@@ -189,6 +189,7 @@ int privkey::decrypt (const bvector & in, bvector & out)
 }
 
 #include <vector>
+#include <list>
 
 int privkey::decrypt (const bvector & in_orig, bvector & out, bvector & errors)
 {
@@ -222,49 +223,82 @@ int privkey::decrypt (const bvector & in_orig, bvector & out, bvector & errors)
 	bvector (syndrome);
 	fft (synd_diag, syndrome);
 
+	//precompute sparse matrix indexes
+	vector<list<uint> > Hsp;
+	Hsp.resize (blocks);
+	for (i = 0; i < blocks; ++i)
+		for (j = 0; j < bs; ++j)
+			if (H[i][j])
+				Hsp[i].push_back (j);
+
+	/*
+	 * count the correlations, abuse the sparsity of matrices.
+	 *
+	 * TODO updating the counts and so on is the slowest part of the
+	 * whole thing. It's all probabilistic, so maybe there is potential
+	 * to speed it up by discarding some (already missing) precision.
+	 *
+	 * FFT would be a cool candidate.
+	 */
+
 	vector<unsigned> unsat;
 	unsat.resize (cs, 0);
 
-	for (i = 0; i < rounds; ++i) {
+	for (uint blk = 0; blk < blocks; ++blk)
+		for (uint i : Hsp[blk]) {
+			for (j = 0; j < bs; ++j)
+				if (syndrome[j])
+					++unsat[blk * bs + (j + bs - i) % bs];
+		}
 
-		/*
-		 * count the correlations, abuse the sparsity of matrices.
-		 *
-		 * TODO this is the slowest part of the whole thing. It's all
-		 * probabilistic, maybe there could be some potential to speed
-		 * it up by discarding some (already missing) precision.
-		 *
-		 * FFT would be a cool candidate.
-		 */
-
-		for (j = 0; j < cs; ++j) unsat[j] = 0;
-		for (uint Hi = 0; Hi < cs; ++Hi)
-			if (H[Hi / bs][Hi % bs]) {
-				uint blk = Hi / bs;
-				for (j = 0; j < bs; ++j)
-					if (syndrome[j])
-						++unsat[blk * bs +
-						        (j + cs - Hi) % bs];
-			}
+	uint round;
+	for (round = 0; round < rounds; ++round) {
 
 		uint max_unsat = 0;
-		for (j = 0; j < cs; ++j)
-			if (unsat[j] > max_unsat) max_unsat = unsat[j];
+		for (i = 0; i < cs; ++i)
+			if (unsat[i] > max_unsat) max_unsat = unsat[i];
 		if (!max_unsat) break;
+		if(max_unsat>bs) out("EROR?!!!");
 		//TODO do something about possible timing attacks
 
 		uint threshold = 0;
 		if (max_unsat > delta) threshold = max_unsat - delta;
 
-		//TODO also timing (but it gets pretty statistically hard here I guess)
-		for (uint bit = 0; bit < cs; ++bit)
-			if (unsat[bit] > threshold) {
-				in[bit] = !in[bit];
-				syndrome.rot_add (H[bit / bs], bit % bs);
+		for (uint bit = 0; bit < cs; ++bit) {
+			if (unsat[bit] <= threshold) continue;
+
+			/*
+			 * heavy trickery starts here, we carefully
+			 * modify the state to avoid necessity of
+			 * recomputation as a whole.
+			 */
+
+			uint blk = bit / bs, blkpos = bit % bs;
+
+			//adjust the error counts that were
+			//caused by this column of H
+			for (uint hpos : Hsp[blk]) {
+				hpos += blkpos;
+				//decide whether there's 1 or 0
+				bool increase = !syndrome[hpos % bs];
+				for (uint b2 = 0; b2 < blocks; ++b2)
+					for (uint h2 : Hsp[b2]) {
+						unsigned&ref = unsat[b2 * bs +
+						                     (hpos + bs - h2) % bs];
+						if (increase) ++ref;
+						else --ref;
+					}
+
+				//and flip it
+				syndrome.flip (hpos % bs);
 			}
+
+			//fix the bit
+			in.flip (bit);
+		}
 	}
 
-	if (i == rounds) return 2; //we simply failed
+	if (round == rounds) return 3; //we simply failed, haha.
 
 	errors = in_orig;
 	errors.add (in); //get the difference