/* > ecdl2-97.c
 * Purpose: Fast arithmetic for computing discrete logs on elliptic curves.
 * Copyright: Robert J. Harley, 1997-1999.
 * Contact: Robert.Harley@inria.fr
 * Legalese: This source code is subject to the GNU Public License (v2).
 */

/* This is ECDL64, version 1.2.1.
 *
 * Consult the following URL for more information:
 *   http://pauillac.inria.fr/~harley/ecdl6/
 */


/*** Quickstart ***/

/* Do this:

cc -O2 ecdl2-97.c -o ecdl
mkdir `uname -n`
cd `uname -n`
nice ../ecdl mail `whoami` MyTeam `uname -n` `uname -m` `uname -s` > log &

 */
/* Number of unsigned long slots in the statically allocated stack below. */
#define STACK_SIZE 16384
/* Statically allocated stack area for this freestanding build. */
unsigned long _stack[STACK_SIZE];
/* Address of the last slot of _stack.
 * NOTE(review): presumably the initial stack pointer for a downward-growing
 * stack; the consumer (crtS.o) is not visible in this file -- confirm.
 */
unsigned long *_sp = &_stack[STACK_SIZE-1];   /* stack address used by crtS.o */

/* Placeholder: main() passes it for the argc and csd arguments of
 * reportDistinguished(), which ignores both.
 */
int dummy;

/*** Detailed instructions ***/

/** 1. Compiling **
 *
 * Compile with something like:
 *   gcc ecdl2-97.c -o ecdl -O2 -freg-struct-return -mcpu=<chip>
 * or:
 *   cc ecdl2-97.c -o ecdl -O5 -non_shared -tune <chip>
 *
 * NB: <chip> is ev4, ev5 or ev6 for Alpha 2106x, 21164 or 21264 families
 *     respectively.
 *
 * You can test the binary like this:
 *   ./ecdl test me Testers bla bla bla
 *
 * The test output should match the sample below except for the
 * number of iterations per second.
 *
 * If the program complains that it cannot find sendmail, you can
 * add -DSENDMAIL='"/usr/sbin/sendmail"' or something similar.
 *
 * NB: Beware if you compile on Digital Unix with -non_shared and run
 *     on Linux.  The program uses popen() to run sendmail; when
 *     compiled on Digital Unix popen() looks for /sbin/sh whereas Linux
 *     has /bin/sh instead.  You will have to create an appropriate link
 *     or else use batch mode (see below).
 *
 * Test output follows:

ECDL64, version: 121.
Mode ...... test
User ...... me
Team ...... Testers
Machine ... bla
Hardware .. bla
OS ........ bla
Comment ... none
Generating 20 new starting points.
Computing iterations...
TEST|i|0000000078E0|u|0E9EB9537AE462CA6E5FF33A4|v|06ECAFCFFB357F2F7E13658E9|x|1DCF3AA4EAEF280DE65345A94|y|1F54C1B9D000000ECC9DB9A93|z|1|ECDL64|121|me|Testers|bla|bla|bla|none
*/

/*== #includes =============================================================*/

/** Ansi includes. **/

//#include <stdio.h>
//#include <string.h>

#include <stdarg.h>
#include <klibc.h>


/* For signal() and SIGTERM. */
//#include <signal.h>

/* For errno. */
//#include <errno.h>


/** Unix includes. **/

/* For getpid(), access() and timing. */
//#include <unistd.h>

/* For timing: getrusage() and struct rusage. */
//#include <sys/time.h>
//#include <sys/resource.h>


/*== Types =================================================================*/

/* Abbreviation used when bit-length does not matter. */
typedef unsigned long ulong;

/* Integers.
 * u32 and u64 are plain int/long aliases, so their widths depend on the
 * platform; main() checks sizeof(u32) == 4 and sizeof(u64) == 8 at startup.
 * u128 is an unsigned 128-bit integer split into two 64-bit words; note that
 * brace initialisers are written { hi, lo }.
 */
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long u64;
typedef struct { u64 hi, lo; } u128;

/* Polynomials over the two-element field GF(2) = Z/2Z.
 * Bit i of lo is the coefficient of t^i and bit j of hi is the coefficient
 * of t^(64+j) (e.g. ONE is { 0, 1 }).  Same layout as u128, but kept as a
 * separate type so integers and polynomials are not mixed up.
 */
typedef struct { u64 hi, lo; } poly128;

/* Operating modes.  Bad is the value parse() was designed to return for an
 * invalid command line.
 */
typedef enum { Bad, Test, Mail, Alt, Batch } modeType;


/*== #defines ==============================================================*/

/*-= Stuff that you can change =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/


/* Number of iterations to run "in parallel".  This is done so that
 * PARAL inversions can be replaced by one inversion and 3*PARAL-3 mults.
 * Try something roughly in the range 10 to 32.
 */
/* Defaults to 10; the #ifndef guard lets a definition supplied before this
 * point (e.g. on the compiler command line) take precedence.
 */
#ifndef PARAL
#define PARAL (10)
#endif


/* Define this to choose a slightly different implementation of the
 * critical GF2Product48x48() function.  Sometimes it is a bit faster,
 * sometimes a bit slower.
 */
/*#define ALT_PROD*/


/* Keyword for giving "inline" hint.
 * Adjust for your compiler or leave it out.
 */
/* Compilers that define __GNUC__ or __DECC get the __inline keyword;
 * everything else gets no hint at all (INLINE expands to nothing).
 */
#if defined(__GNUC__) || defined(__DECC)
#define INLINE __inline
#else
#define INLINE
#endif


/*-= Stuff that must not be changed =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/* Version 0.x.x: March 1998
 * Version 1.0.0: September 1998
 * - Does 340 k iterations per second with "gcc -O2" + Linux + 500 MHz Alpha.
 * - Tried Ari's sqrt(2) trick, took it out again (real speedup was too
 *   small to be worth the extra complexity).
 * Version 1.1.0: July 1999
 * - Various minor clean-ups.
 * Version 1.2.0: August 1999
 * - Changed saved state file to much better format.
 * Version 1.2.1: August 1999
 * - Cosmetic tweaks.
 * - Changed alternative email address to ecdl2-97@rupture.net.
 */
/* Client name and version strings embedded in every reported line. */
#define CLIENT "ECDL64"
#define VERSION "121"


/* Brace initialisers, in { hi, lo } order, used in a few places for the
 * types poly128 and u128.
 */
#define ZERO { 0, 0 }
#define ONE { 0, 1 }


/* The number of cases used in the pseudo-random function is 2^CASES_SHIFT.
 * One case is a doubling, the rest are additions of constant points.
 * The case is selected by the top CASES_SHIFT bits of the low word of x;
 * ADDERS is the number of constant points (2^CASES_SHIFT - 1).
 */
#define CASES_SHIFT (4)
#define ADDERS ((1UL<<CASES_SHIFT)-1)


/* Shifts used to decide if a point is distinguished: a point is
 * distinguished when the top TEST_SHIFT (test mode) or DIST_SHIFT (all other
 * modes) bits of the low word of y are zero.
 */
#define TEST_SHIFT (20)
#define DIST_SHIFT (30)


/*-= Data defining which ECDL problem to solve =-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/* Field is GF(2^97) = (Z/2Z)[t] / (t^97+t^6+1)
 *   (t^97+t^6+1 is irreducible in (Z/2Z)[t]).
 */

/* Curve from Certicom ECC2-97 problem is y^2 + x*y = x^3 + a*x^2 + b
 * over GF(2^97), where:
 * a = 151759946635783345666111607832
 * b =  46586505053717239594563606920
 *
 * Group order is G = 2 * 79228162514264464603828067969.
 * Use subgroup of order G/2 (prime).
 */
/* Each initialiser is { hi, lo }: lo holds bits 0..63 and hi holds the
 * remaining high bits of the value.
 */
#define A       { 0x1EA5CE2B7, 0xF0A58E01B4389418 }
#define B       { 0x09687742B, 0x6329E70680231988 }
#define ORDER   { 0x100000000, 0x00007383E2DE1E81 }

/* Points from Certicom ECC2-97 problem:
 * XP = 46059744771359721362870909225
 * YP = 98068469436517043337972206208
 * XQ = 72978944960364674557961530030
 * YQ = 96375666913718821189526238111
 *
 * Same { hi, lo } layout as above.
 */
#define XP      { 0x094D3BA57, 0x4305888262363929 }
#define YP      { 0x13CE0562C, 0xC51EF416B2C0CA80 }
#define XQ      { 0x0EBCEC4B5, 0x961C75DF04EB6AAE }
#define YQ      { 0x13768154C, 0x2131B67B39A0339F }


/*== Function declarations =================================================*/

/* 128-bit integer helpers (arithmetic on the scalars u and v). */
static INLINE int ge(u128 x, u128 y);
static INLINE u128 diff(u128 x, u128 y);
static INLINE u128 sumMod(u128 x, u128 y, u128 mod);
static INLINE u128 doubleMod(u128 x, u128 mod);
static INLINE u128 incMod(u128 x, u128 mod);

/* Helpers on polynomials over GF(2). */
static INLINE int equal(poly128 x, poly128 y);
static INLINE poly128 xor(poly128 x, poly128 y);
static INLINE u64 topBit(u64 x);

/* Program control. */
static modeType parse(int argc, char *argv[]);
void catchSigTerm(int sigNum);
int main(void);

/* Arithmetic in the field GF(2^97). */
static poly128 square(poly128 x);
static u64 GF2Product48x48(u64 *ph, u64 x, u64 y);
static poly128 product(poly128 x, poly128 y);
static poly128 inverse(poly128 y);
static INLINE poly128 quotient(poly128 x, poly128 y);

/* Group law on the curve; each returns the z coordinate (0 = infinity). */
static int ellipticDouble
  ( poly128 x, poly128 y, int z
  , poly128 *px2, poly128 *py2
  );
static int ellipticSum
  ( poly128 x1, poly128 y1, int z1, poly128 x2, poly128 y2, int z2
  , poly128 *px3, poly128 *py3
  );
static int ellipticProduct
  ( poly128 x, poly128 y, int z, u128 fac
  , poly128 *px2, poly128 *py2
  );

/* Reporting and state handling.  The third parameter of
 * reportDistinguished() is named to match its definition.
 */
static void reportDistinguished
  ( modeType mode, int argc, int csd
  , u64 iters, u128 u, u128 v, poly128 x, poly128 y, int z
  , u64 total, double dStart
  );
static void u64ToBytes(u64 data, u8 *p);
static u64 bytesToU64(u8 *p);
static void writeState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  );
static ulong readState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  );


/*== Global variables ======================================================*/

/* Set to 1 by catchSigTerm() and polled by the main loop.  It is volatile
 * because it is written asynchronously from a signal handler: without the
 * qualifier the compiler may read it once and hoist the test out of the
 * loop.  (sig_atomic_t would be the textbook type, but <signal.h> is
 * deliberately not included in this build.)
 */
volatile int gotSigTerm = 0;


/*== Function definitions ==================================================*/

/*-= Short little functions =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- ge --------------------------------------------------------------------*/

/* Unsigned 128-bit comparison: returns 1 when x >= y and 0 otherwise. */
static INLINE int ge(u128 x, u128 y) {

  /* The high words decide unless they tie. */
  if (x.hi != y.hi) return x.hi > y.hi;

  return x.lo >= y.lo;
} /* end ge */


/*-- diff ------------------------------------------------------------------*/

/* 128-bit subtraction: returns x-y, assuming 0 <= y <= x < 2^128. */
static INLINE u128 diff(u128 x, u128 y) {
  u128 r;
  const u64 borrow = x.lo < y.lo; /* Low word wraps iff x.lo < y.lo. */

  r.lo = x.lo - y.lo;
  r.hi = x.hi - y.hi - borrow;

  return r;
} /* end diff */


/*-- sumMod ----------------------------------------------------------------*/

/* Modular addition: returns (x+y) mod m, assuming 0 <= x, y < m <= 2^127
 * so that the 128-bit sum cannot overflow.
 */
static INLINE u128 sumMod(u128 x, u128 y, u128 m) {
  u128 s;

  s.lo = x.lo + y.lo;
  /* A wrapped low word is smaller than either operand: that is the carry. */
  s.hi = x.hi + y.hi + (s.lo < y.lo);

  /* One conditional subtraction suffices since x+y < 2m. */
  return ge(s, m) ? diff(s, m) : s;
} /* end sumMod */


/*-- doubleMod -------------------------------------------------------------*/

/* Modular doubling: returns (2*x) mod m, assuming 0 <= x < m <= 2^127. */
static INLINE u128 doubleMod(u128 x, u128 m) {
  u128 d;

  /* 128-bit shift left by one; the top bit of lo moves into hi. */
  d.hi = x.hi<<1 | x.lo>>63;
  d.lo = x.lo<<1;

  /* One conditional subtraction suffices since 2x < 2m. */
  return ge(d, m) ? diff(d, m) : d;
} /* end doubleMod */


/*-- incMod ----------------------------------------------------------------*/

/* Modular increment: returns (x+1) mod m, assuming 0 <= x < m < 2^128. */
static INLINE u128 incMod(u128 x, u128 m) {
  const u128 wrapped = ZERO;

  x.lo += 1;
  if (x.lo == 0) x.hi += 1; /* Carry out of the low word. */

  /* x+1 can only reach m exactly, never exceed it. */
  if (x.hi == m.hi && x.lo == m.lo) return wrapped;

  return x;
} /* end incMod */


/*-- equal -----------------------------------------------------------------*/

/* Equality of polys over Z/2Z: returns 1 when x == y and 0 otherwise. */
static INLINE int equal(poly128 x, poly128 y) {

  return x.hi == y.hi && x.lo == y.lo;
} /* end equal */


/*-- xor -------------------------------------------------------------------*/

/* Sum of two polys over Z/2Z, which is simply the word-wise XOR. */
static INLINE poly128 xor(poly128 x, poly128 y) {
  poly128 r;

  r.hi = x.hi ^ y.hi;
  r.lo = x.lo ^ y.lo;

  return r;
} /* end xor */


/*-- topBit ----------------------------------------------------------------*/

/* Isolate the most significant set bit of x: the result has exactly that
 * one bit set, or is 0 when x is 0.
 */
static INLINE u64 topBit(u64 x) {
  u64 smear = x;

  /* Copy the top set bit into every position below it. */
  smear |= smear>>1;
  smear |= smear>>2;
  smear |= smear>>4;
  smear |= smear>>8;
  smear |= smear>>16;
  smear |= smear>>32;

  /* smear is now 0...01...1; dropping all but its highest 1 gives the answer. */
  return smear ^ smear>>1;
} /* end topBit */


/*-- parse -----------------------------------------------------------------*/

/* Parse command line and return mode Bad, Test, Mail, Alt or Batch.
 *
 * In this build the real parser is disabled because it needs strcmp(), and
 * the function always returns Batch without looking at its arguments.
 * The hosted version behaved as follows:
 *   - argc other than 7 or 8 gives Bad;
 *   - argv[1] must be "test", "mail", "alt" or "batch", else Bad;
 *   - any '|' character in argv[2..] gives Bad (it is the field separator
 *     of the reported lines).
 */
static modeType parse(int argc, char *argv[]) {

  /* Unused until the real parser is restored. */
  (void)argc;
  (void)argv;

  return Batch;
} /* end parse */


/*-- catchSigTerm ----------------------------------------------------------*/

/* SIGTERM handler.
 * It only raises the gotSigTerm flag; the main loop tests that flag so the
 * program can stop cleanly and save its state.
 * A user can send the signal with "kill -TERM <pid>", for instance.
 */
void catchSigTerm(int sigNum) {

  (void)sigNum; /* The signal number is not needed. */

  gotSigTerm = 1;

} /* end catchSigTerm */


/*-- main ------------------------------------------------------------------*/

/* Program entry point for the freestanding build.
 *
 * Runs PARAL pseudo-random walks on the curve "in parallel", each walk
 * keeping a point T = [u]P + [v]Q.  Every time a walk hits a distinguished
 * point the point is verified, reported with reportDistinguished() and the
 * walk is restarted.  The loop runs until gotSigTerm is set.
 *
 * Returns 0 on a clean stop, 1 if the integer types have the wrong size
 * and 2 if a reported point fails verification.
 */
int main(void) {
  /* Constant stuff. */
  const int zP = 1, zQ = 1;
  const u128 order = ORDER;
  const poly128 xP = XP, yP = YP, xQ = XQ, yQ = YQ;
  const poly128 zero = ZERO, one = ONE;

  /* These get initialised once and for all. */
  int distShift;
  modeType mode;

  /* These are variable. */
  u64 total;
  /* Start time for rate reporting.  Nothing measures it in this build (the
   * getrusage() call is disabled), but it is still passed by value to
   * reportDistinguished(), so it must hold a defined value.
   */
  double dStart = 0.0;

  /* Constant points [uA[i]]P added by the pseudo-random walk. */
  int zA[ADDERS];
  u128 uA[ADDERS];
  poly128 xA[ADDERS], yA[ADDERS];

  /* State of the PARAL walks: (xT:yT:zT) = [uT]P + [vT]Q after itersT
   * iterations; denT holds the denominators to invert for the next step.
   */
  int zT[PARAL];
  u64 itersT[PARAL];
  u128 uT[PARAL], vT[PARAL];
  poly128 denT[PARAL], xT[PARAL], yT[PARAL];


  /** Simple sanity check. **/

  if (sizeof(u32) != 4 || sizeof(u64) != 8) {
    kprintf("Fatal error: *** the size of u32 or u64 is wrong.\n");
    return 1;
  } /* end if */


  /* The hosted version checked here that SENDMAIL is executable (access())
   * and then parsed the command line with parse(argc, argv), printing a
   * syntax message and returning 1 on mode Bad.  Neither is available in
   * this build, so the mode is fixed.
   */
  mode = Test;

  /* The hosted version printed a banner here (client, version, mode, user,
   * team, machine, hardware, OS, comment).
   */


  /** Initialisation. **/

  /* Pseudo-random points to add.  Do not change. */
  { ulong i;
    u32 startu;

    /* Points have same large prime order as P since 0 < fac < order. */
    startu = 1;
    for (i = 0; i < ADDERS; ++i) {
      u128 fac;
      const u32 pi = 3141592653U, e = 2718281829U; /* Rounded to odd. */

      /* NB: not terribly random, but it will do! */
      startu = startu*pi+e; fac.lo = startu;
      startu = startu*pi+e; fac.lo |= (u64)startu<<32;
      startu = startu*pi+e; fac.hi = startu;
      uA[i] = fac;
      zA[i] = ellipticProduct(xP, yP, zP, fac, xA+i, yA+i);
    } /* end for */

  } /* end block */

  /* Read starting points from the saved state or generate new random ones. */
  { ulong i;

    if (mode == Test) i = 0;
    else i = readState(itersT, uT, vT, xT, yT, zT);

    if (i < PARAL) {
      /* Start from zero so the non-test branch below never reads an
       * indeterminate value now that the /dev/urandom read is disabled.
       */
      u128 seed = ZERO;

      kprintf("Generating %lu new starting points.\n", PARAL-i);

      if (mode == Test) {
        /* Fixed seed for testing. */
        seed.hi = 0;
        seed.lo = 0x0123456789ABCDEF;
      } else {
        /* The hosted version filled seed from /dev/urandom here.
         * NOTE(review): without an entropy source every non-test run starts
         * from the same points; supply real randomness before using this
         * outside test mode.
         */
        seed.hi ^= 4882;
        seed.hi &= 0x00000000FFFFFFFF;
        seed.lo ^= 0x1e10;
      } /* end if/else */

      /* Generate random points. */
      for (; i < PARAL; ++i) {
        const u128 intZero = ZERO;
        /* Euler gamma times 2^96. */
        const u128 gamma = { 0x093C467E3, 0x7DB0C7A4D1BE3F81 };

        itersT[i] = 0;
        uT[i] = intZero; vT[i] = seed;
        zT[i] = ellipticProduct(xQ, yQ, zQ, seed, xT+i, yT+i);

        /* seed += gamma, truncated to 96 bits. */
        seed.lo += gamma.lo;
        seed.hi += seed.lo < gamma.lo; seed.hi += gamma.hi;
        seed.hi &= 0x00000000FFFFFFFF;
      } /* end for (i) */
    } /* end if */
  } /* end block */


  /* The hosted version recorded the starting user time in dStart here
   * (getrusage()).
   */

  kprintf("Computing iterations...\n");

  /* A point is distinguished when yT[i].lo >> distShift is zero. */
  distShift = 64-(mode == Test ? TEST_SHIFT : DIST_SHIFT);

  /* The hosted version installed catchSigTerm() for SIGTERM here; without
   * it nothing in this file sets gotSigTerm.
   */

  for (total = 0; !gotSigTerm; total += PARAL) {
    ulong i;

    /** Check for distinguished points. **/
    for (i = 0; i < PARAL; ++i) {
      while (yT[i].lo>>distShift == 0) { /* Point is distinguished. */

        /* Check that everything is OK i.e., point is equal to [u]P+[v]Q. */
        int z, zu, zv;
        poly128 x, xu, xv, y, yu, yv;

        zu = ellipticProduct(xP, yP, zP, uT[i], &xu, &yu);
        zv = ellipticProduct(xQ, yQ, zQ, vT[i], &xv, &yv);
        z = ellipticSum(xu, yu, zu, xv, yv, zv, &x, &y);

        if (!equal(x, xT[i]) || !equal(y, yT[i]) || z != zT[i]) {
          kprintf("Fatal error: *** bad point detected! ***\n");
          return 2;
        } /* end if */

        reportDistinguished( mode, dummy, dummy,
                            itersT[i], uT[i], vT[i], xT[i], yT[i], zT[i]
                           , total, dStart
                           );

        /* Restart this point by incrementing v. */
        itersT[i] = 0;
        vT[i] = incMod(vT[i], order);
        zT[i] = ellipticSum(xT[i], yT[i], zT[i], xQ, yQ, zQ, xT+i, yT+i);

        /** Save state (except when testing). **/
        if (mode != Test) writeState(itersT, uT, vT, xT, yT, zT);
      } /* end while (distinguished point) */
    } /* end for (i) */


    /** Get denominators for additions and doublings of points. **/
    for (i = 0; i < PARAL; ++i) {
      ulong m;
      poly128 den, x1;

      x1 = xT[i];
      m = x1.lo>>(64-CASES_SHIFT);
      if (m < ADDERS) { /* Addition of constant point. */
        den = xA[m];
        /* If the x coordinates coincide this may turn into a doubling,
         * whose denominator is x1 itself.
         */
        if (!equal(den, x1)) den = xor(den, x1);
        uT[i] = sumMod(uT[i], uA[m], order);
      } else { /* Doubling. */
        den = x1;
        uT[i] = doubleMod(uT[i], order);
        vT[i] = doubleMod(vT[i], order);
      } /* end if/else */

      if (equal(den, zero)) den = one; /* Dummy to avoid division by 0. */
      denT[i] = den;
    } /* end for (i) */


    /** Invert PARAL denominators with 1 inversion and 3*PARAL-3 mults. **/
    { poly128 ix, prod, q, prodT[PARAL];

      /* prodT[i] = denT[0] * ... * denT[i-1]; prod ends as the full product. */
      prod = denT[0];
      for (i = 1; i < PARAL; ++i) {
        prodT[i] = prod;
        prod = product(prod, denT[i]);
      } /* end for */

      q = inverse(prod);

      /* Walk back down: q is always 1/(denT[0] * ... * denT[i]). */
      for (i = PARAL; --i; ) {
        ix = product(q, prodT[i]);
        q = product(q, denT[i]);
        denT[i] = ix;
      } /* end for */
      denT[0] = q;

    } /* end block */


    /** Get new points. **/
    for (i = 0; i < PARAL; ++i) {
      int z1, z2, nz;
      ulong m;
      poly128 nx,ny, x1,y1, x2,y2;
      const poly128 a = A;

      /* Get points to add, (x1:y1:z1) and (x2:y2:z2). */
      x1 = xT[i]; y1 = yT[i]; z1 = zT[i];

      m = x1.lo>>(64-CASES_SHIFT);
      if (m < ADDERS) { x2 = xA[m]; y2 = yA[m]; z2 = zA[m]; }
      else { x2 = x1; y2 = y1; z2 = z1; }


      /* Get their sum as a new point P = (nx:ny:nz) using denT[] table. */
      if (!z1) { nx = x2; ny = y2; nz = z2; }
      else if (!z2) { nx = x1; ny = y1; nz = z1; }
      else if (equal(x1, x2)) {
        if (equal(y1, y2)) { /* Doubling. */
          poly128 lam = xor(product(y1, denT[i]), x1);

          nx = xor(xor(square(lam), lam), a);
          ny = xor(xor(product(lam, xor(nx, x1)), nx), y1);
          nz = 1;
        } else { nx = zero; ny = one; nz = 0; }
      } else { /* General case. */
        poly128 lam = product(xor(y1, y2), denT[i]);
        poly128 t = xor(xor(xor(square(lam), lam), x2), a);

        nx = xor(t, x1);
        ny = xor(xor(product(lam, t), nx), y1);
        nz = 1;
      } /* end if/else if/else if/else */

      xT[i] = nx; yT[i] = ny; zT[i] = nz;
      ++itersT[i];

    } /* end for (i) */

  } /* end main loop */

  kprintf("Received SIGTERM.");


  /** Save state after receiving SIGTERM (except when testing). **/
  if (mode != Test) writeState(itersT, uT, vT, xT, yT, zT);


  kprintf("Bye!");
  return 0;
} /* end main */


/*-= Arithmetic in the field GF(2^97) =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- square ----------------------------------------------------------------*/

/* Square x in the field GF(2^97) i.e., as a poly in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree x < 97.
 *
 * Squaring over GF(2) just spreads the bits out: coefficient i moves to
 * position 2*i.  The spreading is done a byte at a time with a table, and
 * the (up to 193-bit) result is then reduced.
 */
static poly128 square(poly128 x) {
  /* tab[b] is byte b with a zero bit inserted above each of its bits, i.e.
   * bit k of b ends up at bit 2*k of tab[b].
   */
  static const int tab[256] =
  {     0,     1,     4,     5,    16,    17,    20,    21
  ,    64,    65,    68,    69,    80,    81,    84,    85
  ,   256,   257,   260,   261,   272,   273,   276,   277
  ,   320,   321,   324,   325,   336,   337,   340,   341
  ,  1024,  1025,  1028,  1029,  1040,  1041,  1044,  1045
  ,  1088,  1089,  1092,  1093,  1104,  1105,  1108,  1109
  ,  1280,  1281,  1284,  1285,  1296,  1297,  1300,  1301
  ,  1344,  1345,  1348,  1349,  1360,  1361,  1364,  1365
  ,  4096,  4097,  4100,  4101,  4112,  4113,  4116,  4117
  ,  4160,  4161,  4164,  4165,  4176,  4177,  4180,  4181
  ,  4352,  4353,  4356,  4357,  4368,  4369,  4372,  4373
  ,  4416,  4417,  4420,  4421,  4432,  4433,  4436,  4437
  ,  5120,  5121,  5124,  5125,  5136,  5137,  5140,  5141
  ,  5184,  5185,  5188,  5189,  5200,  5201,  5204,  5205
  ,  5376,  5377,  5380,  5381,  5392,  5393,  5396,  5397
  ,  5440,  5441,  5444,  5445,  5456,  5457,  5460,  5461
  , 16384, 16385, 16388, 16389, 16400, 16401, 16404, 16405
  , 16448, 16449, 16452, 16453, 16464, 16465, 16468, 16469
  , 16640, 16641, 16644, 16645, 16656, 16657, 16660, 16661
  , 16704, 16705, 16708, 16709, 16720, 16721, 16724, 16725
  , 17408, 17409, 17412, 17413, 17424, 17425, 17428, 17429
  , 17472, 17473, 17476, 17477, 17488, 17489, 17492, 17493
  , 17664, 17665, 17668, 17669, 17680, 17681, 17684, 17685
  , 17728, 17729, 17732, 17733, 17744, 17745, 17748, 17749
  , 20480, 20481, 20484, 20485, 20496, 20497, 20500, 20501
  , 20544, 20545, 20548, 20549, 20560, 20561, 20564, 20565
  , 20736, 20737, 20740, 20741, 20752, 20753, 20756, 20757
  , 20800, 20801, 20804, 20805, 20816, 20817, 20820, 20821
  , 21504, 21505, 21508, 21509, 21520, 21521, 21524, 21525
  , 21568, 21569, 21572, 21573, 21584, 21585, 21588, 21589
  , 21760, 21761, 21764, 21765, 21776, 21777, 21780, 21781
  , 21824, 21825, 21828, 21829, 21840, 21841, 21844, 21845
  };

  u64 t0,t1,t2, tmp, xh,xl;
  poly128 r;

  xh = x.hi; xl = x.lo;

  /* t0, t1, t2 are the three 64-bit words of the spread-out square:
   * t0 from bits 0..31 of x, t1 from bits 32..63, t2 from bits 64..95.
   */
  t0 = (u64)tab[xl & 255] | (u64)tab[xl>>8 & 255]<<16
       | (u64)tab[xl>>16 & 255]<<32 | (u64)tab[xl>>24 & 255]<<48
  ;
  t1 = (u64)tab[xl>>32 & 255] | (u64)tab[xl>>40 & 255]<<16
       | (u64)tab[xl>>48 & 255]<<32 | (u64)tab[xl>>56]<<48
  ;
  t2 = (u64)tab[xh & 255] | (u64)tab[xh>>8 & 255]<<16
       | (u64)tab[xh>>16 & 255]<<32 | (u64)tab[xh>>24 & 255]<<48
  ;

  /* Handle 97th bit.
   * Bit 32 of hi is t^96, whose square t^192 is congruent to t^101+t^95;
   * those are bits 37 and 31 of t1.
   */
  tmp = xh & 0xFFFFFFFF00000000; t1 ^= tmp<<5; t1 ^= tmp>>1;

  /* Reduce modulo t^97+t^6+1. */
  /* Fold the top word: t^128 is congruent to t^37+t^31. */
  t0 ^= t2<<31; t1 ^= t2>>33;
  t0 ^= t2<<37; t1 ^= t2>>27;

  /* Fold what is left at t^97 and above: t^97 is congruent to t^6+1. */
  tmp = t1>>33; t1 ^= tmp<<33;
  t0 ^= tmp; t0 ^= tmp<<6;

  r.hi = t1; r.lo = t0;
  return r;
} /* end square */


/*-- GF2Product48x48 -------------------------------------------------------*/

/* Multiply y by low 48 bits of x, as polys over Z/2Z, degree y < 48.
 * Speed-critical auxiliary function used for product().
 * Returns low 64 bits of result, puts high 31 bits in *ph.
 * Hi NSA dudes! -- Rob.
 *
 * Method: build a 16-entry table of y times every 4-bit poly, then consume
 * x four bits at a time.
 */
static u64 GF2Product48x48(u64 *ph, u64 x, u64 y) {
  u64 a,b,c, e, tab[16], y1,y2,y3;

  /* y times t, t^2 and t^3. */
  y1 = y<<1; y2 = y<<2; y3 = y<<3;

  /* Gray code walk through table. */
  /* Each step XORs one of y, y1, y2, y3 into e, so consecutive entries
   * differ in a single term; tab[n] ends up as y times the 4-bit poly n.
   */
#ifdef ALT_PROD
  /* Variant walking all 16 entries in one Gray-code sequence. */
  e = 0;   tab[0] = 0;
  e ^= y;  tab[1] = y;
  e ^= y1; tab[3] = e;
  e ^= y;  tab[2] = y1;
  e ^= y2; tab[6] = e;
  e ^= y;  tab[7] = e;
  e ^= y1; tab[5] = e;
  e ^= y;  tab[4] = y2;
  e ^= y3; tab[12] = e;
  e ^= y;  tab[13] = e;
  e ^= y1; tab[15] = e;
  e ^= y;  tab[14] = e;
  e ^= y2; tab[10] = e;
  e ^= y;  tab[11] = e;
  e ^= y1; tab[9] = e;
  e ^= y;  tab[8] = y3;
#else
  /* Default variant: walk entries 0..7 and derive entry n+8 as entry n
   * XOR y3 in the same step.
   */
  e = 0;   tab[0] = e; tab[8] = e ^ y3;
  e ^= y;  tab[1] = e; tab[9] = e ^ y3;
  e ^= y1; tab[3] = e; tab[11] = e ^ y3;
  e ^= y;  tab[2] = e; tab[10] = e ^ y3;
  e ^= y2; tab[6] = e; tab[14] = e ^ y3;
  e ^= y;  tab[7] = e; tab[15] = e ^ y3;
  e ^= y1; tab[5] = e; tab[13] = e ^ y3;
  e ^= y;  tab[4] = e; tab[12] = e ^ y3;
#endif

  /* a = y times bits 0..15 of x. */
  a = tab[x & 15];
  a ^= tab[x>>4 & 15]<<4;
  a ^= tab[x>>8 & 15]<<8;
  a ^= tab[x>>12 & 15]<<12;

  /* b = y times bits 16..31 of x (not yet shifted into place). */
  b = tab[x>>16 & 15];
  b ^= tab[x>>20 & 15]<<4;
  b ^= tab[x>>24 & 15]<<8;
  b ^= tab[x>>28 & 15]<<12;

  /* c = y times bits 32..47 of x (not yet shifted into place). */
  c = tab[x>>32 & 15];
  c ^= tab[x>>36 & 15]<<4;
  c ^= tab[x>>40 & 15]<<8;
  c ^= tab[x>>44 & 15]<<12;

  /* Combine a + b*t^16 + c*t^32; the bits of b and c shifted out of the low
   * word form the high word.
   */
  *ph = b>>48 ^ c>>32;
  return a ^ b<<16 ^ c<<32;
} /* end GF2Product48x48 */


/*-- product ---------------------------------------------------------------*/

/* Product of x and y in the field GF(2^97) i.e., as polys in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree x, y < 97.
 *
 * Bits 0..95 of each operand are split into two 48-bit halves and multiplied
 * with three calls to GF2Product48x48() (Karatsuba); bit 96 is handled
 * separately; the result is then reduced.
 */
static poly128 product(poly128 x, poly128 y) {
  u64 lh,ll, mh,ml, hl,hh, tmp, t0,t1,t2, xb,yb, xh, yh;
  poly128 r;

  /* xh, yh: the operands shifted right by 48 bits.  Only their low 48 bits
   * (bits 48..95 of the operand) take part in the products below.
   */
  xh = x.hi<<16 | x.lo>>48; yh = y.hi<<16 | y.lo>>48;
  /* (lh:ll) = low*low, (mh:ml) = (low+high)*(low+high), (hh:hl) = high*high. */
  ll = GF2Product48x48(&lh, x.lo, y.lo & 0x0000FFFFFFFFFFFF);
  ml = GF2Product48x48(&mh, xh ^ x.lo, (yh ^ y.lo) & 0x0000FFFFFFFFFFFF);
  hl = GF2Product48x48(&hh, xh, yh & 0x0000FFFFFFFFFFFF);

  /* Middle term = (low+high)*(low+high) + low*low + high*high. */
  ml ^= ll; mh ^= lh;
  ml ^= hl; mh ^= hh;

  /* Assemble the unreduced product in three words t2:t1:t0:
   * low*low at t^0, high*high at t^96, the middle term at t^48.
   */
  t0 = ll; t1 = lh | hl<<32; t2 = hl>>32 | hh<<32;
  t0 ^= ml<<48; t1 ^= ml>>16; t1 ^= mh<<48; t2 ^= mh>>16;

  /* Handle 97th bit. */
  /* If x has its t^96 term, add y*t^96 (and vice versa).  The shifts drop
   * the other operand's own t^96 term, so t^96*t^96 = t^192, congruent to
   * t^101+t^95, is added separately when both operands have the term.
   */
  xb = x.hi & 0xFFFFFFFF00000000; yb = y.hi & 0xFFFFFFFF00000000;
  if (xb) { t1 ^= y.lo<<32; t2 ^= y.lo>>32; t2 ^= y.hi<<32; }
  if (yb) { t1 ^= x.lo<<32; t2 ^= x.lo>>32; t2 ^= x.hi<<32; }
  tmp = xb & yb; t1 ^= tmp<<5; t1 ^= tmp>>1;

  /* Reduce modulo t^97+t^6+1. */
  /* Fold the top word: t^128 is congruent to t^37+t^31. */
  t0 ^= t2<<31; t1 ^= t2>>33;
  t0 ^= t2<<37; t1 ^= t2>>27;

  /* Fold what is left at t^97 and above: t^97 is congruent to t^6+1. */
  tmp = t1>>33; t1 ^= tmp<<33;
  t0 ^= tmp; t0 ^= tmp<<6;

  r.hi = t1; r.lo = t0;
  return r;
} /* end product */


/*-- inverse ---------------------------------------------------------------*/

/* Invert y in the field GF(2^97) i.e., as a poly in (Z/2Z)[t]
 * reduced modulo t^97+t^6+1.  Degree y < 97, y != 0.
 *
 * NB: for y = 0 the first while loop below never terminates, so callers
 * must not pass zero.
 *
 * Variables: (ah:al) and (bh:bl) are the polys a and b, (uh:ul) and (vh:vl)
 * their cofactors u and v.  a and b are kept with a non-zero constant term
 * (odd) and are alternately added to one another and divided by powers of
 * t until they are equal.
 */
static poly128 inverse(poly128 y) {
  u64 ah,al, bh,bl, sh,th, t, uh,ul, vh,vl;
  poly128 r;
  /* tab[n] = number of trailing zero bits of the 6-bit value n (6 for 0). */
  static const int tab[64] =
  { 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
  , 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
  };
  /* The modulus t^97+t^6+1: bit 33 of the high word, 65 in the low word. */
  const u64 mh = 1UL<<33, ml = 65;


  /* Maintain: u.y = a and v.y = b modulo t^97+t^6+1. */

  ah = y.hi; al = y.lo;
  uh = 0; ul = 1;

  /* Divide a by t until it is odd.  u is divided by t as well: if u is odd
   * the modulus is added first so that the division is exact.
   */
  while (!(al & 1)) {
    t = ul & 1; ul ^= t<<6; uh ^= t<<33;
    al = al>>1 | ah<<63; ah >>= 1;
    ul = ul>>1 | uh<<63; uh >>= 1;
  } /* end while */

  /* b starts as the modulus itself, with cofactor v = 0. */
  bh = mh; bl = ml;
  vh = 0; vl = 0;

  do {
    do {

      /* While a < b: b += a, v += u, then divide b (and v) by t until b is
       * odd again.
       */
      do {
        bh ^= ah; bl ^= al;
        vh ^= uh; vl ^= ul;

        /* Both a and b were odd, so b is now even and sh >= 1; th = 64-sh
         * is therefore a valid shift count.
         */
        sh = tab[bl & 63]; th = 64-sh;
        /* Add (low 6 bits of v) times the modulus to v: this clears the low
         * 6 bits of v so that it can be shifted right by sh <= 6 exactly.
         */
        t = vl & 63; vl ^= t; vl ^= t<<6; vh ^= t<<33;
        bl = bl>>sh | bh<<th; bh >>= sh;
        vl = vl>>sh | vh<<th; vh >>= sh;
        /* Finish off one bit at a time if b had more than sh trailing zeros. */
        while (!(bl & 1)) {
          t = vl & 1; vl ^= t<<6; vh ^= t<<33;
          bl = bl>>1 | bh<<63; bh >>= 1;
          vl = vl>>1 | vh<<63; vh >>= 1;
        } /* end while */
      } while (ah < bh || ah == bh && al < bl);

      /* Done as soon as a == b; this also makes both enclosing loop
       * conditions false.
       */
      if (al == bl && ah == bh) break;

      /* Mirror image of the loop above with the roles of (a,u) and (b,v)
       * swapped; runs while a > b.
       */
      do {
        ah ^= bh; al ^= bl;
        uh ^= vh; ul ^= vl;

        sh = tab[al & 63]; th = 64-sh;
        t = ul & 63; ul ^= t; ul ^= t<<6; uh ^= t<<33;
        al = al>>sh | ah<<th; ah >>= sh;
        ul = ul>>sh | uh<<th; uh >>= sh;
        while (!(al & 1)) {
          t = ul & 1; ul ^= t<<6; uh ^= t<<33;
          al = al>>1 | ah<<63; ah >>= 1;
          ul = ul>>1 | uh<<63; uh >>= 1;
        } /* end while */
      } while (ah > bh || ah == bh && al > bl);

    /* The two nested conditions together mean "while a != b". */
    } while (al != bl);
  } while (ah != bh);

  /* Now a (and b) must equal 1. */

  /* Reduce u modulo t^97+t^6+1. */
  t = uh>>33; ul ^= t; ul ^= t<<6; uh ^= t<<33;

  r.hi = uh; r.lo = ul;
  return r;
} /* end inverse */


/*-- quotient --------------------------------------------------------------*/

/* Field division in GF(2^97): returns x/y, computed as x times the inverse
 * of y, reduced modulo t^97+t^6+1.  Degree x, y < 97, y != 0.
 */
static INLINE poly128 quotient(poly128 x, poly128 y) {
  const poly128 yInv = inverse(y);

  return product(x, yInv);
} /* end quotient */


/*-= Arithmetic on elliptic curve over the field =-=-=-=-=-=-=-=-=-=-=-=-=-=*/

/*-- ellipticDouble --------------------------------------------------------*/

/* Point doubling on y^2 + x*y = x^3 + a*x^2 + b.
 * Takes the point (x:y:z) and computes its double (x2:y2:z2) by the group
 * law, storing x2 in *px2 and y2 in *py2 and returning z2.
 * Finite points are (x:y:1); the point at infinity is (0:1:0).
 */
static int ellipticDouble
  ( poly128 x, poly128 y, int z
  , poly128 *px2, poly128 *py2
  ) {
  const poly128 a = A, one = ONE, zero = ZERO;
  poly128 slope, slopeSq, xNew, yNew;

  /* Infinity, or a point with x = 0, doubles to infinity. */
  if (z == 0 || equal(x, zero)) {
    *px2 = zero;
    *py2 = one;
    return 0;
  } /* end if */

  /* slope = x + y/x */
  slope = xor(x, quotient(y, x));

  /* x2 = slope^2 + slope + a */
  slopeSq = square(slope);
  xNew = xor(xor(slopeSq, slope), a);

  /* y2 = slope*(x + x2) + x2 + y */
  yNew = product(slope, xor(x, xNew));
  yNew = xor(xor(yNew, xNew), y);

  *px2 = xNew;
  *py2 = yNew;
  return 1;
} /* end ellipticDouble */


/*-- ellipticSum -----------------------------------------------------------*/

/* Point addition on y^2 + x*y = x^3 + a*x^2 + b.
 * Takes the points (x1:y1:z1) and (x2:y2:z2) and computes their sum
 * (x3:y3:z3) by the group law, storing x3 in *px3 and y3 in *py3 and
 * returning z3.
 * Finite points are (x:y:1); the point at infinity is (0:1:0).
 */
static int ellipticSum
  ( poly128 x1, poly128 y1, int z1, poly128 x2, poly128 y2, int z2
  , poly128 *px3, poly128 *py3
  ) {
  const poly128 a = A;
  poly128 slope, partial, xNew, yNew;

  /* Infinity is the neutral element. */
  if (z1 == 0) { *px3 = x2; *py3 = y2; return z2; }
  if (z2 == 0) { *px3 = x1; *py3 = y1; return z1; }

  /* Same x: either the same point (double it) or opposite points. */
  if (equal(x1, x2)) {
    const poly128 zero = ZERO, one = ONE;

    if (!equal(y1, y2)) {
      *px3 = zero;
      *py3 = one;
      return 0;
    } /* end if */

    return ellipticDouble(x1, y1, z1, px3, py3);
  } /* end if */

  /* slope = (y1 + y2)/(x1 + x2) */
  slope = quotient(xor(y1, y2), xor(x1, x2));

  /* partial = slope^2 + slope + x2 + a, so that x3 = partial + x1. */
  partial = xor(xor(xor(square(slope), slope), x2), a);
  xNew = xor(partial, x1);

  /* y3 = slope*(x1 + x3) + x3 + y1, and x1 + x3 is just partial. */
  yNew = xor(xor(product(slope, partial), xNew), y1);

  *px3 = xNew;
  *py3 = yNew;
  return 1;
} /* end ellipticSum */


/*-- ellipticProduct -------------------------------------------------------*/

/* Scalar multiplication on y^2 + x*y = x^3 + a*x^2 + b.
 * Takes the point (x:y:z) and computes its fac-th multiple (x2:y2:z2) by
 * the group law (double-and-add, most significant bit first), storing x2
 * in *px2 and y2 in *py2 and returning z2.
 * Finite points are (x:y:1); the point at infinity is (0:1:0).
 */
static int ellipticProduct
  ( poly128 x, poly128 y, int z, u128 fac
  , poly128 *px2, poly128 *py2
  ) {
  int zAcc;
  u64 bit, word;
  poly128 xAcc, yAcc;

  /* The most significant non-zero 64-bit half of fac, if any. */
  word = fac.hi != 0 ? fac.hi : fac.lo;

  /* Quick exit if fac was 0. */
  if (word == 0) {
    const poly128 zero = ZERO, one = ONE;

    *px2 = zero;
    *py2 = one;
    return 0;
  } /* end if */

  /* The accumulator starts as the point itself, which accounts for the top
   * bit; then run through the remaining bits of that half.
   */
  xAcc = x; yAcc = y; zAcc = z;
  for (bit = topBit(word)>>1; bit != 0; bit >>= 1) {
    zAcc = ellipticDouble(xAcc, yAcc, zAcc, &xAcc, &yAcc);
    if (word & bit) zAcc = ellipticSum(x, y, z, xAcc, yAcc, zAcc, &xAcc, &yAcc);
  } /* end for */

  /* If that was the high half, all 64 bits of the low half follow. */
  if (fac.hi != 0) {
    word = fac.lo;
    for (bit = 1UL<<63; bit != 0; bit >>= 1) {
      zAcc = ellipticDouble(xAcc, yAcc, zAcc, &xAcc, &yAcc);
      if (word & bit) zAcc = ellipticSum(x, y, z, xAcc, yAcc, zAcc, &xAcc, &yAcc);
    } /* end for */
  } /* end if */

  *px2 = xAcc;
  *py2 = yAcc;
  return zAcc;
} /* end ellipticProduct */


/*-= Various file manipulation thingies =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-*/

/*-- reportDistinguished ---------------------------------------------------*/

/* Reporting of distinguished points back to headquarters.
 *
 * Prints one '|'-separated line with kprintf():
 *   <identity>|i|<iters>|u|<u>|v|<v>|x|<x>|y|<y>|z|<z>|<client>|<version>|
 *   <user>|<team>|<machine>|<hardware>|<OS>|<comment>
 * where identity is "TEST" in test mode and "ECC2-97" otherwise, and the
 * last six fields are fixed strings in this build.
 *
 * mode    selects the identity string.
 * iters   iteration count of the walk that found the point.
 * u, v    scalars such that the point is [u]P + [v]Q.
 * x, y, z the distinguished point.
 * argc, csd, total and dStart are accepted for interface compatibility but
 * unused: the hosted version used total and dStart to print the iteration
 * rate and, in Mail/Alt mode, piped the line to sendmail.
 *
 * The line is emitted in five pieces rather than with a single format.
 * NOTE(review): presumably to stay within what kprintf() can take in one
 * call -- confirm against klibc.
 * NOTE(review): these formats print lower-case hex, whereas the sample
 * output at the top of the file (and the original single format, "%lX")
 * is upper-case -- confirm which one the collecting side expects.
 */
static void reportDistinguished
  ( modeType mode, int argc, int csd
  , u64 iters, u128 u, u128 v
  , poly128 x, poly128 y, int z
  , u64 total, double dStart
  ) {
  /* static: the strings are constant, no need to rebuild them per call. */
  static const char format1[] = "%s|i|%012lx|u|%09lx%016lx|";
  static const char format2[] = "v|%09lx%016lx|";
  static const char format3[] = "x|%09lx%016lx|y|%09lx%016lx|z|%x|";
  static const char format4[] = CLIENT "|" VERSION "|%s|%s|%s|%s";
  static const char format5[] = "|%s|%s\n";

  const char *identity = mode == Test ? "TEST" : "ECC2-97";

  (void)argc;
  (void)csd;
  (void)total;
  (void)dStart;

  kprintf(format1, identity, iters, u.hi, u.lo);
  kprintf(format2, v.hi, v.lo);
  /* %x expects an unsigned int; z is only ever 0 or 1. */
  kprintf(format3, x.hi, x.lo, y.hi, y.lo, (unsigned)z);
  kprintf(format4, "danielp", "SchlagTeam", "disy", "21164A");
  kprintf(format5, "L4/Alpha", "DisyGroup");

} /* end reportDistinguished */


/*-- u64ToBytes ------------------------------------------------------------*/

/* Store data into p[0..7], least significant byte first. */
static void u64ToBytes(u64 data, u8 *p) {
  int k;

  for (k = 0; k < 8; ++k) {
    p[k] = (u8)data; /* Keep the low byte... */
    data >>= 8;      /* ...and move on to the next one. */
  } /* end for */

} /* end u64ToBytes */


/*-- bytesToU64 ------------------------------------------------------------*/

/* Assemble a u64 from p[0..7], least significant byte first (the inverse
 * of u64ToBytes()).
 */
static u64 bytesToU64(u8 *p) {
  u64 value = 0;
  int k;

  /* Start from the most significant byte so each step is shift-and-OR. */
  for (k = 7; k >= 0; --k) {
    value = value<<8 | p[k];
  } /* end for */

  return value;
} /* end bytesToU64 */


/*-- writeState ------------------------------------------------------------*/

/* Intended to save the state of all walks to a file in a portable binary
 * format.
 * NB: in this build it is an empty stub: nothing is written and all
 * arguments are ignored, so no state survives a restart.
 */
static void writeState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  ) {

} /* end writeState */


/*-- readState -------------------------------------------------------------*/

/* Intended to read state back from file (if it exists) in a portable binary
 * format and return the number of valid items (i.e., u,v,x,y,z data) found.
 * NB: There is nothing to check for the itersT[] array.
 * NB: in this build it is a stub: it leaves every array untouched and
 * always returns 0, so the caller generates all starting points afresh.
 */
static ulong readState
  ( u64 *itersT, u128 *uT, u128 *vT, poly128 *xT, poly128 *yT, int *zT
  ) {
    return 0;
} /* end readState */


/*== end of file ecdl2-97.c ================================================*/
/*
void *memset( void *s, int c, size_t n) {
    for(;n>0;n--)
	((char *)s)[n] = (char) c;
    return s;
}

void *memcpy(void *s1, const void *s2, size_t n) {
    for(;n>0;n--) 
	 ((char *)s1)[n] = ((char *)s2)[n];
    return s1;
}
*/
