Commit 70f779fb authored by Armin Luntzer

save last working state

parent e33bcc77
@@ -120,11 +120,11 @@ static void boot_cpus(void)
 	for (i = 1; i < CONFIG_SMP_CPUS_MAX; i++) {
-		printk("booting cpu %d\n", i);
+		pr_info("booting cpu %d\n", i);
 		cpu_wake(i);
 		while (!ioread32be(&cpu_ready[i]));
-		printk("cpu %d booted\n", i);
+		pr_info("cpu %d booted\n", i);
 	}
 }
@@ -141,7 +141,7 @@ void smp_cpu_entry(void)
 	arch_local_irq_enable();
-	printk("hi i'm cpu %d\n", leon3_cpuid());
+	pr_info("hi i'm cpu %d\n", leon3_cpuid());
 	BUG_ON(!leon3_cpuid());
 	/* signal ready */
@@ -79,7 +79,11 @@ struct task_struct {
 	ktime wakeup;		/* start of next period */
 	ktime deadline;		/* deadline of current period */
+
+	ktime create;		/* time of task creation */
+	ktime wakeup_first;
 	ktime exec_start;
+	ktime exec_stop;
 	ktime total;
 	unsigned long slices;
 obj-y += main.o
+obj-y += demo.o
+obj-y += demo_net.o
 obj-$(CONFIG_XENTIUM_PROC_DEMO) += xentium_demo.o
 obj-$(CONFIG_EMBED_MODULES_IMAGE) += modules-image.o
-/**
- * This creates a number processing nodes in a processing network.
- * Two special trackers are used for input and output.
- */
-
 #include <kernel/kernel.h>
 #include <kernel/kmem.h>
 #include <kernel/kthread.h>
+#include <kernel/err.h>
+#include <kernel/smp.h>
+
+#include <asm/io.h>
-
-#include <data_proc_task.h>
-#include <data_proc_tracker.h>
-#include <data_proc_net.h>
-
-#define CRIT_LEVEL 10
-
-#define OP_ADD 0x1234
-#define OP_SUB 0x1235
-#define OP_MUL 0x1236
-
-#define STEPS 3
-
-int op_output(unsigned long op_code, struct proc_task *t)
-{
-        ssize_t i;
-        ssize_t n;
-
-        unsigned int *p = NULL;
-
-        n = pt_get_nmemb(t);
-        printk("OUT: op code %d, %d items\n", op_code, n);
-
-        if (!n)
-                goto exit;
-
-        p = (unsigned int *) pt_get_data(t);
-        if (!p)
-                goto exit;
-
-        for (i = 0; i < n; i++) {
-                printk("\t%d\n", p[i]);
-        }
-
-exit:
-        kfree(p);       /* clean up our data buffer */
-
-        pt_destroy(t);
-
-        return PN_TASK_SUCCESS;
-}
-
-int op_add(unsigned long op_code, struct proc_task *t)
-{
-        ssize_t i;
-        ssize_t n;
-
-        unsigned int *p;
-
-        n = pt_get_nmemb(t);
-
-        if (!n)
-                return PN_TASK_SUCCESS;
-
-        p = (unsigned int *) pt_get_data(t);
-        if (!p) /* we have elements but data is NULL, error*/
-                return PN_TASK_DESTROY;
-
-        printk("ADD: op code %d, %d items\n", op_code, n);
-
-        for (i = 0; i < n; i++) {
-                p[i] += 10;
-        }
-
-        return PN_TASK_SUCCESS;
-}
-
-int op_sub(unsigned long op_code, struct proc_task *t)
-{
-        ssize_t i;
-        ssize_t n;
-
-        unsigned int *p;
-
-        n = pt_get_nmemb(t);
-
-        if (!n)
-                return PN_TASK_SUCCESS;
-
-        p = (unsigned int *) pt_get_data(t);
-        if (!p) /* we have elements but data is NULL, error*/
-                return PN_TASK_DESTROY;
-
-        printk("SUB: op code %d, %d items\n", op_code, n);
-
-        for (i = 0; i < n; i++) {
-                p[i] -= 2;
-        }
-
-        return PN_TASK_SUCCESS;
-}
-
-int op_mul(unsigned long op_code, struct proc_task *t)
-{
-        ssize_t i;
-        ssize_t n;
-
-        unsigned int *p;
-
-        n = pt_get_nmemb(t);
-
-        if (!n)
-                return PN_TASK_SUCCESS;
-
-        p = (unsigned int *) pt_get_data(t);
-        if (!p) /* we have elements but data is NULL, error*/
-                return PN_TASK_DESTROY;
-
-        printk("MUL: op code %d, %d items\n", op_code, n);
-
-        for (i = 0; i < n; i++) {
-                p[i] *= 3;
-        }
-
-        return PN_TASK_SUCCESS;
-}
-
-int pn_prepare_nodes(struct proc_net *pn)
-{
-        struct proc_tracker *pt;
-
-        /* create and add processing node trackers for the each operation */
-        pt = pt_track_create(op_add, OP_ADD, CRIT_LEVEL);
-        BUG_ON(!pt);
-        BUG_ON(pn_add_node(pn, pt));
-
-        pt = pt_track_create(op_sub, OP_SUB, CRIT_LEVEL);
-        BUG_ON(!pt);
-        BUG_ON(pn_add_node(pn, pt));
-
-        pt = pt_track_create(op_mul, OP_MUL, CRIT_LEVEL);
-        BUG_ON(!pt);
-        BUG_ON(pn_add_node(pn, pt));
-
-        BUG_ON(pn_create_output_node(pn, op_output));
-
-        return 0;
-}
-
-void pn_new_input_task(struct proc_net *pn, size_t n)
-{
-        struct proc_task *t;
-        static int seq;
-        int i;
-        unsigned int *data;
-
-        t = pt_create(NULL, 0, STEPS, 0, seq++);
-        BUG_ON(!t);
-
-        BUG_ON(pt_add_step(t, OP_ADD, NULL));
-        BUG_ON(pt_add_step(t, OP_SUB, NULL));
-        BUG_ON(pt_add_step(t, OP_MUL, NULL));
-
-        data = kzalloc(sizeof(unsigned int) * n);
-        for (i = 0; i < n; i++)
-                data[i] = i;
-
-        pt_set_data(t, data, n * sizeof(unsigned int));
-        pt_set_nmemb(t, n);
-
-        pn_input_task(pn, t);
-}
-
-int demo(void *p __attribute__((unused)))
-{
-        struct proc_net *pn;
-
-        printk("DEMO STARTING\n");
-
-        pn = pn_create();
-        BUG_ON(!pn);
-
-        pn_prepare_nodes(pn);
-
-        pn_new_input_task(pn, 5);
-        pn_new_input_task(pn, 0);
-        pn_new_input_task(pn, 3);
-
-        pn_process_inputs(pn);
-
-        while (pn_process_next(pn));
-
-        pn_process_outputs(pn);
-
-        printk("DEMO COMPLETE\n");
-
-        return 0;
-}
-
-void demo_start(void)
-{
-        struct task_struct *t;
-
-        t = kthread_create(demo, NULL, KTHREAD_CPU_AFFINITY_NONE, "DEMO");
-        /* allocate 98% of the cpu */
-        kthread_set_sched_edf(t, 100*1000, 99*1000, 98*1000);
-        if (kthread_wake_up(t) < 0)
-                printk("---- IASW NOT SCHEDULABLE---\n");
-}
+
+static volatile double per_loop_avg[CONFIG_SMP_CPUS_MAX];
+
+static int copytask(void *data)
+{
+#define BUFLEN 1024*1024
+        int i;
+        int cpu;
+        int *go;
+
+        ktime cnt = 0;
+        ktime start, stop;
+        ktime total = 0;
+
+        // static uint32_t *common[CONFIG_SMP_CPUS_MAX];
+        static uint32_t *cpu_buf[CONFIG_SMP_CPUS_MAX];
+
+        go = (int *) data;
+
+        cpu = smp_cpu_id();
+
+        cpu_buf[smp_cpu_id()] = kmalloc(BUFLEN * sizeof(uint32_t));
+        if (!cpu_buf[cpu])
+                return 0;
+
+        (*go) = 1;      /* signal ready */
+
+        /* wait for trigger */
+        while (ioread32be(go) != CONFIG_SMP_CPUS_MAX);
+
+        while (ioread32be(go)) {
+                start = ktime_get();
+                for (i = 0 ; i < BUFLEN; i++) {
+                        cpu_buf[cpu][i] = cpu_buf[CONFIG_SMP_CPUS_MAX - cpu - 1][i];
+                }
+                stop = ktime_get();
+
+                total += stop - start;
+                cnt++;
+
+                per_loop_avg[cpu] = ( ((double) total / (double) cnt) / (double) (BUFLEN));
+        }
+
+        return 0;
+}
+
+int copy_resprint(void *data)
+{
+        int i;
+        int *go;
+
+        ktime start;
+
+        double res[CONFIG_SMP_CPUS_MAX];
+
+        go = (int *) data;
+
+        /* wait for trigger */
+        while (ioread32be(go) != CONFIG_SMP_CPUS_MAX);
+
+        start = ktime_get();
+
+        /* sample and print for about 360 seconds */
+        while (ktime_delta(ktime_get(), start) < ms_to_ktime(360 * 1000)) {
+                for (i = 0; i < CONFIG_SMP_CPUS_MAX; i++)
+                        res[i] = per_loop_avg[i];
+
+                printk("%g ", 0.001 * (double) ktime_to_ms(ktime_get()));
+                for (i = 0; i < CONFIG_SMP_CPUS_MAX; i++)
+                        printk("%g ", res[i]);
+                printk("\n");
+        }
+
+        (*go) = 0;      /* signal stop */
+
+        return 0;
+}
+
+int copybench_start(void)
+{
+        int i;
+        int go;
+
+        struct task_struct *t;
+
+        printk("COPYBENCH STARTING\n");
+        printk("Creating tasks, please stand by\n");
+
+        for (i = 0; i < CONFIG_SMP_CPUS_MAX; i++) {
+        // for (i = CONFIG_SMP_CPUS_MAX - 1; i >= 0; i--) {
+                go = 0;
+                t = kthread_create(copytask, &go, i, "COPYTASK");
+                if (!IS_ERR(t)) {
+                        /* allocate 95% of the cpu, period = 1s */
+                        kthread_set_sched_edf(t, 1000 * 1000, 980 * 1000, 950 * 1000);
+                        if (kthread_wake_up(t) < 0) {
+                                printk("---- %s NOT SCHEDUL-ABLE---\n", t->name);
+                                BUG();
+                        }
+                        while (!ioread32be(&go));       /* wait for task to become ready */
+                } else {
+                        printk("Got an error in kthread_create!");
+                        break;
+                }
+                printk("Copy task ready on cpu %d\n", i);
+        }
+
+        printk("Creating RR cpu-hopping printout task\n");
+
+        t = kthread_create(copy_resprint, &go, KTHREAD_CPU_AFFINITY_NONE, "PRINTTASK");
+        if (kthread_wake_up(t) < 0) {
+                printk("---- %s NOT SCHEDUL-ABLE---\n", t->name);
+                BUG();
+        }
+
+        printk("Triggering...\n");
+        go = CONFIG_SMP_CPUS_MAX;       /* set trigger */
+        sched_yield();
+
+        while (ioread32be(&go));        /* wait for completion */
+
+        printk("Average time to cross-copy buffers:\n");
+        for (i = 0; i < CONFIG_SMP_CPUS_MAX; i++) {
+                printk("\tCPU %d: %g ns per sample\n", i, per_loop_avg[i]);
+        }
+
+        printk("COPYBENCH DONE\n");
+
+        return 0;
+}
+
+int edftask(void *data)
+{
+        int i;
+        int loops = (* (int *) data);
+
+        for (i = 0; i < loops; i++);
+
+        return i;
+}
+
+int oneshotedf_start(void)
+{
+        int i;
+        int loops = 1700000000;
+
+        struct task_struct *t[CONFIG_SMP_CPUS_MAX];
+
+        // printk("EDF CREATE STARTING\n");
+        // printk("Creating tasks, please stand by\n");
+
+        for (i = 0; i < CONFIG_SMP_CPUS_MAX; i++) {
+                t[i] = kthread_create(edftask, &loops, i, "EDFTASK");
+                if (!IS_ERR(t[i])) {
+                        /* create and launch edf thread */
+                        kthread_set_sched_edf(t[i], 0, 100, 50);
+                        if (kthread_wake_up(t[i]) < 0) {
+                                printk("---- %s NOT SCHEDUL-ABLE---\n", t[i]->name);
+                                BUG();
+                        }
+                } else {
+                        printk("Got an error in kthread_create!");
+                        break;
+                }
+                // printk("Copy task ready on cpu %d\n", i);
+        }
+
+        sched_yield();
+        printk("%lld\n", ktime_to_ms(ktime_get()));
+        printk("Wakeup, creation, exec_start, exec_stop, deadline:\n");
+        for (i = 0; i < CONFIG_SMP_CPUS_MAX; i++) {
+                printk("\tCPU %d: %lld %lld %lld %lld %lld\n",
+                       i,
+                       ktime_to_us(ktime_delta(t[i]->wakeup_first, t[i]->create)),
+                       ktime_to_us(ktime_delta(t[i]->wakeup, t[i]->create)),
+                       ktime_to_us(ktime_delta(t[i]->exec_start, t[i]->create)),
+                       ktime_to_us(ktime_delta(t[i]->exec_stop, t[i]->create)),
+                       ktime_to_us(ktime_delta(t[i]->deadline, t[i]->create)));
+        }
+
+        printk("COPYBENCH DONE\n");
+
+        return 0;
+}
/**
 * This creates a number of processing nodes in a processing network.
 * Two special trackers are used for input and output.
 */
#include <kernel/kernel.h>
#include <kernel/kmem.h>
#include <kernel/kthread.h>
#include <kernel/err.h>
#include <kernel/smp.h>
#include <asm/io.h>
#include <data_proc_task.h>
#include <data_proc_tracker.h>
#include <data_proc_net.h>
#define SRC_BUF_ELEM (1024 * 256)
#define COMPR_BUF_ELEM (1024 * 256)
#define CRIT_LEVEL 10
#define OP_PREPROC_NLC 0x1234
#define OP_DECORR_DIFF 0x1235
#define OP_LOSSY3_ROUND2 0x1236
#define OP_LLC_ARI1 0x1237
struct CompressedBuf {
unsigned int datatype;
unsigned int llcsize; /* compressed size (bytes) */
unsigned int nelements; /* number of elements that went into llc */
unsigned int xelements;
unsigned int yelements;
unsigned int zelements; /* number of frames, e.g. for imagettes */
unsigned int lossyCrc; /* CRC after lossy steps */
void *data;
};
struct ScienceBuf {
unsigned int datatype;
unsigned int nelements;
unsigned int xelements;
unsigned int yelements;
unsigned int zelements;
void *data;
};
struct ProcData {
ktime start;
struct ScienceBuf source;
struct ScienceBuf swap;
struct CompressedBuf compressed;
};
/* A union which permits us to convert between a float and a 32 bit
int. */
typedef union
{
float value;
unsigned int word;
} ieee_float_shape_type;
/* Get a 32 bit int from a float. */
#define GET_FLOAT_WORD(i,d) \
do { \
ieee_float_shape_type gf_u; \
gf_u.value = (d); \
(i) = gf_u.word; \
} while (0)
/* Set a float from a 32 bit int. */
#define SET_FLOAT_WORD(d,i) \
do { \
ieee_float_shape_type sf_u; \
sf_u.word = (i); \
(d) = sf_u.value; \
} while (0)
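
/*
 * Illustration only (not part of the original sources): a quick round trip
 * through the GET_FLOAT_WORD/SET_FLOAT_WORD macros above. 1.0f has the
 * IEEE 754 single-precision bit pattern 0x3f800000, and rebuilding the float
 * from that word gives 1.0f back. The function name is made up for this sketch.
 */
static void __attribute__((unused)) float_word_roundtrip_example(void)
{
	float f = 1.0f;
	float g = 0.0f;
	unsigned int w;

	GET_FLOAT_WORD(w, f);	/* w == 0x3f800000 */
	SET_FLOAT_WORD(g, w);	/* g == 1.0f again */

	(void) g;
}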
/*
from newlib/libm/common/sf_round.c
NOTE: this funny implementation does not make use of the sign bit, but works with signed data
*/
float roundf(float x)
{
int w;
/* Most significant word, least significant word. */
int exponent_less_127;
GET_FLOAT_WORD(w, x);
/* Extract exponent field. */
exponent_less_127 = ((w & 0x7f800000) >> 23) - 127;
if (exponent_less_127 < 23)
{
if (exponent_less_127 < 0)
{
w &= 0x80000000;
if (exponent_less_127 == -1)
/* Result is +1.0 or -1.0. */
w |= (127 << 23);
}
else
{
unsigned int exponent_mask = 0x007fffff >> exponent_less_127;
if ((w & exponent_mask) == 0)
/* x has an integral value. */
return x;
w += 0x00400000 >> exponent_less_127;
w &= ~exponent_mask;
}
}
else
{
if (exponent_less_127 == 128)
/* x is NaN or infinite. */
return x + x;
else
return x;
}
SET_FLOAT_WORD(x, w);
return x;
}
/**
* @brief apply bit rounding for unsigned integers in place
* @param source pointer to the input data
* @param nbits number of bits to round
* @param n number of samples to process
*
* @note the result is right-shifted by nbits, but we round in float
*/
void BitRounding32u (unsigned int *source, unsigned int nbits, unsigned int n)
{
unsigned int i;
unsigned int cellwidth;
float reciprocal;
if (nbits >= 32)
return;
cellwidth = 1u << nbits;
reciprocal = 1.0f / (float)cellwidth;
for (i=0; i < n; i++)
source[i] = (unsigned int)roundf((float)source[i] * reciprocal);
return;
}
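
/*
 * Illustration only (not part of the original sources): with nbits = 2 (the
 * setting used by op_lossy3_round2 further down), BitRounding32u() divides
 * each sample by 2^2 = 4 and rounds to the nearest integer, so {5, 7, 10}
 * becomes {1, 2, 3}. The function name is made up for this sketch.
 */
static void __attribute__((unused)) bitrounding_example(void)
{
	unsigned int samples[3] = { 5, 7, 10 };

	BitRounding32u(samples, 2, 3);	/* samples is now { 1, 2, 3 } */
}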
#define SPLINE_SEGMENTS 28
unsigned int rborder[SPLINE_SEGMENTS]; /* right borders of spline segment intervals */
double A[SPLINE_SEGMENTS]; /* 0th order coefficients for nlcSplineCorr28 */
double B[SPLINE_SEGMENTS]; /* 1st order coefficients for nlcSplineCorr28 */
double C[SPLINE_SEGMENTS]; /* 2nd order coefficients for nlcSplineCorr28 */
double D[SPLINE_SEGMENTS]; /* 3rd order coefficients for nlcSplineCorr28 */
/**
* @brief Function used by @ref NlcSplineCorr28. It returns the index of the right (=upper) border of the interval that the
* given number belongs to. This function is designed to handle exactly 28 intervals.
* @param value the value which is sought within the intervals given by rborder
* @param rb an array of right (=upper) borders
* @note the right border is assumed to belong to the interval
* @note this is implemented as a bisection to be as fast as possible
*
* @returns the index of the interval to which the input value belongs
*/
int GetInterval28 (unsigned int value, unsigned int *rb)
{
int r=0;
if (value <= rb[13])
{
/* 0..13 */
if (value <= rb[6])
{
/* 0..6 */
if (value <= rb[3])
{
/* 0..3 */
if (value <= rb[1])
{
/* 0..1 */
if (value <= rb[0])
{
/* 0 */
r = 0;
}
else
{
/* 1 */
r = 1;
}
}
else
{
/* 2..3 */
if (value <= rb[2])
{
/* 2 */
r = 2;
}
else
{
/* 3 */
r = 3;
}
}
}
else
{
/* 4..6 */
if (value <= rb[5])
{
/* 4..5 */
if (value <= rb[4])
{
/* 4 */
r = 4;
}
else
{
/* 5 */
r = 5;
}
}
else
{
/* 6 */
r = 6;
}
}
}
else
{
/* 7..13 */
if (value <= rb[10])
{
/* 7..10 */
if (value <= rb[8])
{
/* 7..8 */
if (value <= rb[7])
{
/* 7 */
r = 7;
}
else
{
/* 8 */
r = 8;
}
}
else
{
/* 9..10 */
if (value <= rb[9])
{
/* 9 */
r = 9;
}
else
{
/* 10 */
r = 10;
}
}
}
else
{
/* 11..13 */
if (value <= rb[12])
{
/* 11..12 */
if (value <= rb[11])
{
/* 11 */
r = 11;
}
else
{
/* 12 */
r = 12;
}
}
else
{
/* 13 */
r = 13;
}
}
}
}
else
{
/* 14..27 */
if (value <= rb[20])
{
/* 14..20 */
if (value <= rb[17])
{
/* 14..17 */
if (value <= rb[15])
{
/* 14..15 */
if (value <= rb[14])
{
/* 14 */
r = 14;
}
else
{
/* 15 */
r = 15;
}
}
else
{
/* 16..17 */
if (value <= rb[16])
{
/* 16 */
r = 16;
}
else
{
/* 17 */
r = 17;
}
}
}
else
{
/* 18..20 */
if (value <= rb[19])
{
/* 18..19 */
if (value <= rb[18])
{
/* 18 */
r = 18;
}
else
{
/* 19 */
r = 19;
}
}
else
{
/* 20 */
r = 20;
}
}
}
else
{
/* 21..27 */
if (value <= rb[24])
{
/* 21..24 */
if (value <= rb[22])
{
/* 21..22 */
if (value <= rb[21])
{
/* 21 */
r = 21;
}
else
{
/* 22 */
r = 22;
}
}
else
{
/* 23..24 */
if (value <= rb[23])
{
/* 23 */
r = 23;
}
else
{
/* 24 */
r = 24;
}
}
}
else
{
/* 25..27 */
if (value <= rb[26])
{
/* 25..26 */
if (value <= rb[25])
{
/* 25 */
r = 25;
}
else
{
/* 26 */
r = 26;
}
}
else
{
/* 27 */
r = 27;
}
}
}
}
return r;
}
/**
* @brief Nonlinearity correction for the CHEOPS CCD readout values.
* It uses a set of splines as correction function.
* @param[in,out] data the array of pixel values stored as unsigned ints;
* this will be overwritten by the corrected values
* @param n the number of pixel values to be corrected
* @note overwrites input array
* @note saturates the corrected values at 65535
*/
void NlcSplineCorr28 (unsigned int *data, unsigned int n)
{
unsigned int i, value, rightBorderIndex;
double x, xx, xxx;
unsigned short utemp16 = 0;
float ftemp;
for (i=0; i < SPLINE_SEGMENTS; i++)
{
ftemp = 1.0;
// CrIaCopyArrayItem (NLCBORDERS_ID, &utemp16, i);
rborder[i] = (unsigned int) utemp16;
// CrIaCopyArrayItem (NLCCOEFF_A_ID, &ftemp, i);
A[i] = (double) ftemp;
// CrIaCopyArrayItem (NLCCOEFF_B_ID, &ftemp, i);
B[i] = (double) ftemp;
// CrIaCopyArrayItem (NLCCOEFF_C_ID, &ftemp, i);
C[i] = (double) ftemp;
// CrIaCopyArrayItem (NLCCOEFF_D_ID, &ftemp, i);
D[i] = (double) ftemp;
}
for (i=0; i < n; i++)
{
value = data[i];
/* get the index of the right border of the interval the current value belongs to */
rightBorderIndex = GetInterval28 (value, rborder);
/* The spline coefficients assume that x starts at 0 within the interval,
but our x counts from 0 to 65k, so we have to shift the x axis for
every interval back to zero by subtracting the left border.
The first interval starts at 0, so nothing has to be done for it. */
if (rightBorderIndex != 0)
{
x = (double) (value - rborder[rightBorderIndex-1]);
}
else
{
x = (double) value;
}
/* this saves one multiplication */
xx = x*x;
xxx = x*xx;
x = D[rightBorderIndex]*xxx + C[rightBorderIndex]*xx + B[rightBorderIndex]*x + A[rightBorderIndex];
/* the result is not truncated to integer, but rounded with the help of the inbuilt fdtoi instruction */
value = (unsigned int) roundf ((float)x);
/* saturate a corrected value at 16 bits */
if (value > 0xffff)
value = 0xffff;
data[i] = value;
}
return;
}
/**
* @brief reversible differencing of a buffer
* @param buf an integer pointer to a buffer
* @param words number of values to process
*
* Differences are made in place, from bottom to top
* @note Is applied in place.
*/
void Delta32 (int *buf, int words)
{
int i;
for (i=words-1; i>0; i--)
{
buf[i] = (buf[i] - buf[i-1]);
}
return;
}
/**
* @brief fold negative values into positive, interleaving the positive ones
* @param buffer an integer pointer to a buffer
* @param N number of values to process
*
* @note Is applied in place.
*/
void Map2Pos32 (int *buffer, unsigned int N)
{
unsigned int i;
for (i=0; i < N; i++)
{
if (buffer[i] < 0)
buffer[i] = ((0xFFFFFFFF - buffer[i]) << 1) + 1; /* NOTE: the integer overflow is intended */
else
buffer[i] = buffer[i] << 1;
}
return;
}
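
/*
 * Illustration only (not part of the original sources): the decorrelation
 * pair Delta32() + Map2Pos32(), as applied later in op_decorr_diff(). Delta32
 * keeps the first sample and replaces the rest by differences to the previous
 * sample; Map2Pos32 then folds the signed result into non-negative values
 * (0, -1, 1, -2, 2, ... map to 0, 1, 2, 3, 4, ...). The function name and the
 * sample values are made up for this sketch.
 */
static void __attribute__((unused)) decorr_example(void)
{
	int buf[4] = { 10, 12, 11, 11 };

	Delta32(buf, 4);	/* buf is now { 10, 2, -1, 0 } */
	Map2Pos32(buf, 4);	/* buf is now { 20, 4,  1, 0 } */
}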
/**
* @brief safe (but slow) way to put the value of a single bit into a bitstream accessed as 32-bit RAM
* in big endian
* @param value the value to put, either 0 or 1
* @param bitOffset index of the bit as seen from the very beginning of the bitstream
* @param destAddr this is the pointer to the beginning of the bitstream
* @note Do not use values like 23 and assume that the LSB will be set. It won't.
* @note works in SRAM2
*/
void PutBit32 (unsigned int value, unsigned int bitOffset, unsigned int *destAddr)
{
unsigned int wordpos, bitpos;
unsigned int destval, mask;
wordpos = bitOffset >> 5; /* division by 32 */
/* bitpos = bitOffset - 32*wordpos; */
bitpos = bitOffset & 0x1f; /* 5 bits */
/* shape a mask with the required bit set true */
mask = 1 << (31-bitpos);
/* get the destination word and clear the bit */
destval = destAddr[wordpos];
destval &= ~mask;
/* set bit if the value was true */
if (value == 1)
destval |= mask;
/* write it back */
destAddr[wordpos] = destval;
return;
}
/**
* @brief safe (but slow) way to put the value of up to 32 bits into a bitstream accessed as 32-bit RAM
* in big endian
* @param value the value to put, it will be masked
* @param bitOffset bit index where the bits will be put, seen from the very beginning of the bitstream
* @param nBits number of bits to put
* @param destAddr this is the pointer to the beginning of the bitstream
* @returns number of bits written or 0 if the number was too big
* @note works in SRAM2
*/
unsigned int PutNBits32 (unsigned int value, unsigned int bitOffset, unsigned int nBits, unsigned int *destAddr)
{
unsigned int *localAddr;
unsigned int bitsLeft, shiftRight, shiftLeft, localEndPos;
unsigned int mask, n2;
/* leave in case of erroneous input */
if (nBits == 0)
return 0;
if (nBits > 32)
return 0;
/* separate the bitOffset into word offset (set localAddr pointer) and local bit offset (bitsLeft) */
localAddr = destAddr + (bitOffset >> 5);
bitsLeft = bitOffset & 0x1f;
/* (M) we mask the value first to match its size in nBits */
/* the calculations can be re-used in the unsegmented code, so we have no overhead */
shiftRight = 32 - nBits;
mask = 0xffffffff >> shiftRight;
value &= mask;
/* to see if we need to split the value over two words we need the right end position */
localEndPos = bitsLeft + nBits;
if (localEndPos <= 32)
{
/* UNSEGMENTED
|-----------|XXXXX|----------------|
bitsLeft n bitsRight
-> to get the mask:
shiftRight = bitsLeft + bitsRight = 32 - n
shiftLeft = bitsRight
*/
/* shiftRight = 32 - nBits; */ /* see (M) above! */
shiftLeft = shiftRight - bitsLeft;
/* generate the mask, the bits for the values will be true */
/* mask = (0xffffffff >> shiftRight) << shiftLeft; */ /* see (M) above! */
mask <<= shiftLeft;
/* clear the destination with inverse mask */
*(localAddr) &= ~mask;
/* assign the value */
*(localAddr) |= (value << (32-localEndPos)); /* NOTE: 32-localEndPos = shiftLeft can be simplified */
}
else
{
/* SEGMENTED
|-----------------------------|XXX| |XX|------------------------------|
bitsLeft n1 n2 bitsRight
-> to get the mask part 1:
shiftright = bitsleft
n1 = n - (bitsleft + n - 32) = 32 - bitsleft
-> to get the mask part 2:
n2 = bitsleft + n - 32
shiftleft = 32 - n2 = 32 - (bitsleft + n - 32) = 64 - bitsleft - n
*/
n2 = bitsLeft + nBits - 32;
/* part 1: */
shiftRight = bitsLeft;
mask = 0xffffffff >> shiftRight;
/* clear the destination with inverse mask */
*(localAddr) &= ~mask;
/* assign the value part 1 */
*(localAddr) |= (value >> n2);
/* part 2: */
/* adjust address */
localAddr += 1;
shiftLeft = 64 - bitsLeft - nBits;
mask = 0xffffffff << shiftLeft;
/* clear the destination with inverse mask */
*(localAddr) &= ~mask;
/* assign the value part 2 */
*(localAddr) |= (value << (32-n2));
}
return nBits;
}
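
/*
 * Illustration only (not part of the original sources): packing a few fields
 * back to back with PutNBits32(). Fields are written MSB-first; the third
 * write crosses a 32-bit word boundary and therefore takes the segmented path
 * described in the comments above. Values and the function name are made up
 * for this sketch.
 */
static void __attribute__((unused)) bitstream_example(void)
{
	unsigned int stream[4] = { 0, 0, 0, 0 };
	unsigned int off = 0;

	off += PutNBits32(0x5,     off,  3, stream);	/* bits  0..2  */
	off += PutNBits32(0xabc,   off, 12, stream);	/* bits  3..14 */
	off += PutNBits32(0x1ffff, off, 20, stream);	/* bits 15..34, spills into stream[1] */

	/* off == 35 bits written in total */
	(void) off;
}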
/**
* ARI parameters
*/
#define ARIHDR 2
#define FMARIROWS 256
#define FMARISPILL 257
#define MAXFREQ 8192
#define SPILLCUT FMARIROWS
#define PHOT_STANDARD 0
/**
* @brief structure used by the @ref fmari_compress algorithm
*/
struct arimodel {
unsigned int *freqtable;
unsigned int *cptable; /* cumulative probability table */
unsigned int *ncptable; /* next cumulative probability table */
unsigned int *spillover; /* swap buffer for spillover, i.e. words > 8 Bit */
int spillctr;
int probability;
};
/**
* This number defines the number of bits used for the codeword in the @ref vbwl_midsize
* algorithm, which is located at the start of each group and says how many bits are used for
* the symbols in this group.
*
* set to 3 if you are sure that there are no larger values than 16 bits (the length code L = 9..16
* will be encoded as L-VBWLMINW = C = 0..7 in 3 bits. Each symbol of the group will be encoded in L bits)
*
* set to 4 for a cap of 24 bits, set to 5 for a cap of 40 bits
*
* @warning Larger values than what you set as cap will corrupt the output stream
* and it would be hard to decode such a stream.
*
*/
#define VBWLCODE 5
/**
* The minimum number of bits to encode a spillover value is 9,
* because if it was 8, it would not have landed in the spill.
* There is one exception, because the bias @ref FMARIROWS is subtracted
* from the spill before calling the @ref vbwl_midsize function.
* This leaves a small range of values to get a width of < 9, but
* at present vbwl does not make efficient use of it and encodes them in 9 bits.
*/
#define VBWLMINW 9
/**
* @brief initialize the model table for arithmetic compression (internal function)
* @param model pointer to the @ref arimodel structure
* @param buffer pointer to the buffer where the table will reside
* @param symbols number of symbols covered by the model (i.e. the number of histogram bins)
*/
void init_arimodel (struct arimodel *model, unsigned int *buffer, int symbols)
{
/* note: symbols is counted here without the spill probability */
model->freqtable = buffer;
model->cptable = model->freqtable + symbols + 1; /* cumulative probability table */
model->ncptable = model->cptable + 1; /* next cumulative probability table */
model->spillover = model->ncptable + symbols + 1; /* swap buffer for spillover, i.e. words > 8 Bit */
model->spillctr = 0;
model->probability = 0;
return;
}
/**
* @brief initialize the cumulative frequency in the model table for arithmetic compression (internal function)
* @param table pointer to the frequency table of the @ref arimodel (freqtable)
* @param cumu pointer to the cumulative frequency table of the @ref arimodel (cptable)
* @param nrows number of symbols covered by the model (i.e. the number of histogram bins)
* @returns last value of the cumulative table, i.e. the number of samples, the sum of the histogram
*/
int makeCumulative (unsigned int *table, unsigned int nrows, unsigned int *cumu)
{
unsigned int ctr;
for (ctr=0; ctr < nrows; ctr++) /* clean table for the "cumulative probabilities" */
cumu[ctr] = 0;
for (ctr=0; ctr < nrows; ctr++) /* the new table is +1 in size !! */
cumu[ctr+1] = cumu[ctr] + table[ctr];
return cumu[nrows];
}
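
/*
 * Illustration only (not part of the original sources): for a mini-histogram
 * {2, 3, 5}, makeCumulative() fills the cumulative table with {0, 2, 5, 10}
 * and returns 10, the total number of samples. The function name is made up
 * for this sketch.
 */
static void __attribute__((unused)) cumulative_example(void)
{
	unsigned int freq[3] = { 2, 3, 5 };
	unsigned int cumu[4];
	int total;

	total = makeCumulative(freq, 3, cumu);	/* cumu = { 0, 2, 5, 10 }, total = 10 */
	(void) total;
}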
/**
* @brief create a new model from a histogram of a buffer
* @param buffer pointer to the buffer of samples to make the histogram
* @param nwords number of samples in that buffer
* @param symbols number of samples to skip at the beginning of the buffer (where the spillover values are)
* @param initval bias value used in every histogram bin. We recommend 1.
* @param model pointer to the @ref arimodel structure
*/
void update_fm_ari_model (unsigned int *buffer, unsigned int nwords, unsigned int symbols, int initval, struct arimodel *model)
{
unsigned int ctr;
unsigned int value;
/* start model with 0 or 1 in every entry -> smooth */
for (ctr=0; ctr < FMARISPILL; ctr++)
model->freqtable[ctr] = initval;
/* count freqs over the buffer, but leave out the first (symbols) words for smoothing */
for (ctr=symbols; ctr < nwords; ctr++)
{
value = buffer[ctr];
/*SDPRINT ("updatemodel [%d] = %d\n", ctr, buffer[ctr]); */
if (value < FMARIROWS)
model->freqtable[value]++;
else
model->freqtable[FMARIROWS]++; /* spillover */
}
/* make new (n)cp array */
model->probability = makeCumulative (model->freqtable, FMARISPILL, model->cptable);
return;
}
/**
* @brief set the initial values for the arithmetic compression model (histogram) for the first chunk
* @param destmodel pointer to the histogram buffer in the @ref arimodel
* @param ModeID select which model to use
* @note this is still from PACS, need a CHEOPS statistic here
*/
void initAriTable (int *destmodel, int ModeID)
{
switch ( ModeID )
{
case ( PHOT_STANDARD ) :
default :
{
/* startmodel for default full-frame compression */
destmodel[0] = 201;
destmodel[1] = 200;
destmodel[2] = 200;
destmodel[3] = 197;
destmodel[4] = 199;
destmodel[5] = 194;
destmodel[6] = 195;
destmodel[7] = 190;
destmodel[8] = 192;
destmodel[9] = 184;
destmodel[10] = 186;
destmodel[11] = 178;
destmodel[12] = 181;
destmodel[13] = 172;
destmodel[14] = 174;
destmodel[15] = 165;
destmodel[16] = 167;
destmodel[17] = 157;
destmodel[18] = 160;
destmodel[19] = 150;
destmodel[20] = 153;
destmodel[21] = 143;
destmodel[22] = 145;
destmodel[23] = 134;
destmodel[24] = 138;
destmodel[25] = 127;
destmodel[26] = 130;
destmodel[27] = 120;
destmodel[28] = 123;
destmodel[29] = 113;
destmodel[30] = 116;
destmodel[31] = 107;
destmodel[32] = 109;
destmodel[33] = 99;
destmodel[34] = 102;
destmodel[35] = 93;
destmodel[36] = 95;
destmodel[37] = 87;
destmodel[38] = 89;
destmodel[39] = 81;
destmodel[40] = 83;
destmodel[41] = 75;
destmodel[42] = 78;
destmodel[43] = 70;
destmodel[44] = 72;
destmodel[45] = 65;
destmodel[46] = 67;
destmodel[47] = 60;
destmodel[48] = 62;
destmodel[49] = 56;
destmodel[50] = 58;
destmodel[51] = 51;
destmodel[52] = 54;
destmodel[53] = 47;
destmodel[54] = 49;
destmodel[55] = 44;
destmodel[56] = 46;
destmodel[57] = 40;
destmodel[58] = 42;
destmodel[59] = 37;
destmodel[60] = 39;
destmodel[61] = 34;
destmodel[62] = 36;
destmodel[63] = 31;
destmodel[64] = 33;
destmodel[65] = 28;
destmodel[66] = 30;
destmodel[67] = 26;
destmodel[68] = 28;
destmodel[69] = 24;
destmodel[70] = 26;
destmodel[71] = 22;
destmodel[72] = 23;
destmodel[73] = 20;
destmodel[74] = 22;
destmodel[75] = 18;
destmodel[76] = 19;
destmodel[77] = 16;
destmodel[78] = 18;
destmodel[79] = 15;
destmodel[80] = 16;
destmodel[81] = 14;
destmodel[82] = 15;
destmodel[83] = 12;
destmodel[84] = 14;
destmodel[85] = 11;
destmodel[86] = 12;
destmodel[87] = 10;
destmodel[88] = 11;
destmodel[89] = 10;
destmodel[90] = 10;
destmodel[91] = 9;
destmodel[92] = 9;
destmodel[93] = 8;
destmodel[94] = 9;
destmodel[95] = 7;
destmodel[96] = 8;
destmodel[97] = 7;
destmodel[98] = 7;
destmodel[99] = 6;
destmodel[100] = 7;
destmodel[101] = 5;
destmodel[102] = 6;
destmodel[103] = 5;
destmodel[104] = 5;
destmodel[105] = 4;
destmodel[106] = 5;
destmodel[107] = 4;
destmodel[108] = 5;
destmodel[109] = 4;
destmodel[110] = 4;
destmodel[111] = 4;
destmodel[112] = 4;
destmodel[113] = 3;
destmodel[114] = 4;
destmodel[115] = 3;
destmodel[116] = 3;
destmodel[117] = 3;
destmodel[118] = 3;
destmodel[119] = 3;
destmodel[120] = 3;
destmodel[121] = 2;
destmodel[122] = 3;
destmodel[123] = 2;
destmodel[124] = 2;
destmodel[125] = 2;
destmodel[126] = 2;
destmodel[127] = 2;
destmodel[128] = 2;
destmodel[129] = 2;
destmodel[130] = 2;
destmodel[131] = 2;
destmodel[132] = 2;
destmodel[133] = 2;
destmodel[134] = 2;
destmodel[135] = 2;
destmodel[136] = 2;
destmodel[137] = 2;
destmodel[138] = 2;
destmodel[139] = 2;
destmodel[140] = 2;
destmodel[141] = 1;
destmodel[142] = 2;
destmodel[143] = 1;
destmodel[144] = 2;
destmodel[145] = 1;
destmodel[146] = 2;
destmodel[147] = 1;
destmodel[148] = 1;
destmodel[149] = 1;
destmodel[150] = 1;
destmodel[151] = 1;
destmodel[152] = 1;
destmodel[153] = 1;
destmodel[154] = 1;
destmodel[155] = 1;
destmodel[156] = 1;
destmodel[157] = 1;
destmodel[158] = 1;
destmodel[159] = 1;
destmodel[160] = 1;
destmodel[161] = 1;
destmodel[162] = 1;
destmodel[163] = 1;
destmodel[164] = 1;
destmodel[165] = 1;
destmodel[166] = 1;
destmodel[167] = 1;
destmodel[168] = 1;
destmodel[169] = 1;
destmodel[170] = 1;
destmodel[171] = 1;
destmodel[172] = 1;
destmodel[173] = 1;
destmodel[174] = 1;
destmodel[175] = 1;
destmodel[176] = 1;
destmodel[177] = 1;
destmodel[178] = 1;
destmodel[179] = 1;
destmodel[180] = 1;
destmodel[181] = 1;
destmodel[182] = 1;
destmodel[183] = 1;
destmodel[184] = 1;
destmodel[185] = 1;
destmodel[186] = 1;
destmodel[187] = 1;
destmodel[188] = 1;
destmodel[189] = 1;
destmodel[190] = 1;
destmodel[191] = 1;
destmodel[192] = 1;
destmodel[193] = 1;
destmodel[194] = 1;
destmodel[195] = 1;
destmodel[196] = 1;
destmodel[197] = 1;
destmodel[198] = 1;
destmodel[199] = 1;
destmodel[200] = 1;
destmodel[201] = 1;
destmodel[202] = 1;
destmodel[203] = 1;
destmodel[204] = 1;
destmodel[205] = 1;
destmodel[206] = 1;
destmodel[207] = 1;
destmodel[208] = 1;
destmodel[209] = 1;
destmodel[210] = 1;
destmodel[211] = 1;
destmodel[212] = 1;
destmodel[213] = 1;
destmodel[214] = 1;
destmodel[215] = 1;
destmodel[216] = 1;
destmodel[217] = 1;
destmodel[218] = 1;
destmodel[219] = 1;
destmodel[220] = 1;
destmodel[221] = 1;
destmodel[222] = 1;
destmodel[223] = 1;
destmodel[224] = 1;
destmodel[225] = 1;
destmodel[226] = 1;
destmodel[227] = 1;
destmodel[228] = 1;
destmodel[229] = 1;
destmodel[230] = 1;
destmodel[231] = 1;
destmodel[232] = 1;
destmodel[233] = 1;
destmodel[234] = 1;
destmodel[235] = 1;
destmodel[236] = 1;
destmodel[237] = 1;
destmodel[238] = 1;
destmodel[239] = 1;
destmodel[240] = 1;
destmodel[241] = 1;
destmodel[242] = 1;
destmodel[243] = 1;
destmodel[244] = 1;
destmodel[245] = 1;
destmodel[246] = 1;
destmodel[247] = 1;
destmodel[248] = 1;
destmodel[249] = 1;
destmodel[250] = 1;
destmodel[251] = 1;
destmodel[252] = 1;
destmodel[253] = 1;
destmodel[254] = 1;
destmodel[255] = 1;
destmodel[256] = 131; /* NOTE: spillover, yes this table has 257 entries! */
break;
}
}
return;
}
/**
*
* these variables are shared among the core coding functions of fmari
*
*@{*/
/** lower bound of local encoding interval */
unsigned int fm_ari_low[CONFIG_SMP_CPUS_MAX];
/** upper bound of local encoding interval */
//unsigned int fm_ari_high[CONFIG_SMP_CPUS_MAX] = {0xffff, 0xfff, 0xffff, 0xffff};
unsigned int fm_ari_high[CONFIG_SMP_CPUS_MAX]; /* XXX */
/** flag to signal underflow */
unsigned int fm_ari_underflow[CONFIG_SMP_CPUS_MAX];
/** the write counter for the output bitstream */
unsigned int fm_ari_wctr[CONFIG_SMP_CPUS_MAX];
/**@}*/
/**
* @brief calculate the new interval and output bits to the bitstream if necessary
* @param dest pointer to the base of the output bitstream
* @param cp the cumulative probability of that value (taken from the @ref arimodel)
* @param ncp the next cumulative probability of that value (taken from the @ref arimodel)
* @par Globals
* @ref fm_ari_low[smp_cpu_id()], @ref fm_ari_high[smp_cpu_id()], @ref fm_ari_underflow[smp_cpu_id()], @ref fm_ari_wctr[smp_cpu_id()]
*/
void fmari_encodeSym8k (unsigned int *dest, unsigned int cp, unsigned int ncp)
{
unsigned int width;
unsigned int a;
/* calculate the new interval */
width = (fm_ari_high[smp_cpu_id()] - fm_ari_low[smp_cpu_id()]) + 1;
fm_ari_high[smp_cpu_id()] = fm_ari_low[smp_cpu_id()] + ((ncp * width) >> 13) - 1; /* L + Pni * (H - L) */
fm_ari_low[smp_cpu_id()] = fm_ari_low[smp_cpu_id()] + ((cp * width) >> 13); /* L + Pci * (H - L) */
for ( ; ; )
{
a = fm_ari_high[smp_cpu_id()] & 0x8000;
/* write out equal bits */
if (a == (fm_ari_low[smp_cpu_id()] & 0x8000))
{
PutBit32 (a >> 15, fm_ari_wctr[smp_cpu_id()]++, dest);
while (fm_ari_underflow[smp_cpu_id()] > 0)
{
PutBit32 ((~fm_ari_high[smp_cpu_id()] & 0x8000) >> 15, fm_ari_wctr[smp_cpu_id()]++, dest);
fm_ari_underflow[smp_cpu_id()]--;
}
}
/* underflow coming up, because the MSBs still differ and the 2nd bits are just one apart */
else if ((fm_ari_low[smp_cpu_id()] & 0x4000) && !(fm_ari_high[smp_cpu_id()] & 0x4000))
{
fm_ari_underflow[smp_cpu_id()]++;
fm_ari_low[smp_cpu_id()] &= 0x3fff;
fm_ari_high[smp_cpu_id()] |= 0x4000;
}
else
{
return;
}
fm_ari_low[smp_cpu_id()] <<= 1;
fm_ari_low[smp_cpu_id()] &= 0xffff;
fm_ari_high[smp_cpu_id()] <<= 1;
fm_ari_high[smp_cpu_id()] |= 1;
fm_ari_high[smp_cpu_id()] &= 0xffff;
}
/* the return is inside the for loop */
}
/**
* @brief at the end of an encoding chunk, flush out necessary remaining bits
* @param dest pointer to the base of the output bitstream
* @par Globals
* @ref fm_ari_low[smp_cpu_id()], @ref fm_ari_underflow[smp_cpu_id()], @ref fm_ari_wctr[smp_cpu_id()]
*/
void fmari_flushEncoder (unsigned int *dest)
{
PutBit32 ((fm_ari_low[smp_cpu_id()] & 0x4000) >> 14, fm_ari_wctr[smp_cpu_id()]++, dest);
fm_ari_underflow[smp_cpu_id()]++;
while (fm_ari_underflow[smp_cpu_id()]-- > 0)
PutBit32 ((~fm_ari_low[smp_cpu_id()] & 0x4000) >> 14, fm_ari_wctr[smp_cpu_id()]++, dest);
return;
}
/**
* @brief encode a chunk of symbols to an output bitstream. Spillover values are saved in the @ref arimodel's dedicated buffer
* @param chunk pointer to the input data
* @param chunksize number of symbols in this chunk, best use @ref MAXFREQ (or less if the chunk is smaller)
* @param dest pointer to the base of the output bitstream of that chunk segment
* @param model pointer to the @ref arimodel structure
* @par Globals
* A number of (local) globals are initialized here
* @ref fm_ari_low[smp_cpu_id()], @ref fm_ari_high[smp_cpu_id()], @ref fm_ari_underflow[smp_cpu_id()], @ref fm_ari_wctr[smp_cpu_id()]
* @note make the (local) globals either static or move to arimodel or pass as arguments (or live with it)
*/
int fmari_encode_chunk (int *chunk, int chunksize, int *dest, struct arimodel *model)
{
int ctr, tail;
unsigned int symbol, cp, ncp;
/* now init ari */
fm_ari_low[smp_cpu_id()] = 0;
fm_ari_high[smp_cpu_id()] = 0xffff;
fm_ari_underflow[smp_cpu_id()] = 0;
fm_ari_wctr[smp_cpu_id()] = 32; /* offset for chunksize_w */
for (ctr=0; ctr < chunksize; ctr++)
{
symbol = chunk[ctr]; /* get next symbol */
/* look it up in the tables */
/* first we check for spillover */
if (symbol >= SPILLCUT)
{
/* encode spillover signal in ari stream */
cp = model->cptable[FMARIROWS];
ncp = model->ncptable[FMARIROWS];
fmari_encodeSym8k ((unsigned int *) dest, cp, ncp);
/* put the symbol into the spillover buffer and increment counter */
model->spillover[(model->spillctr)++] = symbol;
}
else /* valid symbol */
{
cp = model->cptable[symbol];
ncp = model->ncptable[symbol];
fmari_encodeSym8k ((unsigned int *)dest, cp, ncp);
}
}
/* encode the rest */
fmari_flushEncoder ((unsigned int *) dest);
/* calc fillup and fill up with 0s */
tail = (32 - fm_ari_wctr[smp_cpu_id()] % 32) % 32;
fm_ari_wctr[smp_cpu_id()] += tail;
dest[0] = (fm_ari_wctr[smp_cpu_id()] / 32);
return dest[0]; /* now in words */
}
unsigned int bits_used (unsigned int num)
{
unsigned int u;
for (u=0; num != 0; u++)
{
num >>= 1;
}
return u;
}
/**
* @brief variable block word length encoding. Used for the spillover in FmAri
* @param source pointer to the input data
* @param words number of symbols to encode
* @param dest pointer to the base of the output bitstream
* @param BS block size, i.e. how many symbols are put into a group
* @note this function is the weak point of the FmAri (ARI1) implementation.
* Ok, it has worked for Herschel, where we had very few spill counts, but we want to get rid of it in CHEOPS.
* @returns size in 32-bit words of the output stream, rounded up
*/
int vbwl_midsize (int *source, int words, int *dest, int BS)
{
int ctr, bctr;
int bits, width;
int outbits = 32; /* keep track of the output bits; we start at dest[1] */
/* main loop counts through the source words */
for (ctr=0; ctr < words; ctr++)
{
/* block-loop, we count through the words of a block */
for (width=0, bctr=ctr; (bctr < ctr+BS) & (bctr < words); bctr++)
{
/* determine used bits of current word */
/* bits = 32-lzeros */
bits = bits_used(((unsigned int *)source)[bctr]);
width = bits > width ? bits : width;
}
/* now we know width = the number of bits to encode the block */
/* first code the width */
if (width < VBWLMINW) /* ... due to the optional -FMARIROWS */
width = VBWLMINW;
/* use VBWLCODE bits for the encoding of the width */
PutNBits32(width-VBWLMINW, outbits, VBWLCODE, (unsigned int *) dest);
outbits += VBWLCODE;
/* now code the words of the block in width bits */
for (bctr=ctr; (ctr < bctr+BS) & (ctr < words); ctr++)
{
PutNBits32 (source[ctr], outbits, width, (unsigned int *)dest);
outbits += width;
}
ctr--;
}
/* store the original size */
dest[0] = words;
/* return the size in words, rounding up */
return (outbits+31)/32;
}
/**
* @brief The FM Arithmetic compression function. ARI1 in CHEOPS-slang.
* @param source pointer to the input data.
* @param nwords number of symbols to encode
* @param dest pointer to the base of the output bitstream
* @param swap a working buffer is needed with a size of (strictly speaking) nwords+257+258 words,
* but if you can guess the spillcount, use spillcount+257+258
* @param modeltype initial probability model to start with. Choose from @ref initAriTable
* @returns size in 32-bit words of the output stream, rounded up
* @note The spillover is encoded with @ref vbwl_midsize and that algorithm is quite inefficient.
* Ok, it is difficult to encode the spill, but that algorithm really does a bad job at it.
* In particular, the @ref VBWLCODE define is limiting the range of values.
*/
int fmari_compress (unsigned int *source, unsigned int nwords, unsigned int *dest, unsigned int *swap, unsigned int modeltype)
{
int ctr;
int src_ctr;
int remaining;
int *streamctr_w;
unsigned int *stream;
struct arimodel model;
init_arimodel (&model, swap, FMARIROWS);
dest[0] = nwords; /* store original size in words */
remaining = nwords;
src_ctr = 0;
streamctr_w = (int *) (dest + 1); /* here the size of the ari stream in words will grow */
*streamctr_w = 0; /* we start with 2, because we have the osize and the streamctr */
stream = dest + ARIHDR; /* set dest stream and counter */
initAriTable((int *) model.freqtable, modeltype); /* initialize starting model */
/* make probability model */
model.probability = makeCumulative(model.freqtable, FMARISPILL, model.cptable);
/* start compressing chunks with initial model */
while (remaining > MAXFREQ)
{
*streamctr_w += fmari_encode_chunk((int *)(source + src_ctr), MAXFREQ, \
(int *)(stream + *streamctr_w), &model);
/* derive new model from current data */
update_fm_ari_model (source + src_ctr, MAXFREQ, FMARISPILL, 1, &model);
src_ctr += MAXFREQ;
remaining -= MAXFREQ;
}
/* encode the last chunk */
if (remaining > 0)
*streamctr_w += fmari_encode_chunk ((int *)(source + src_ctr), remaining, \
(int *)(stream + *streamctr_w), &model);
/* .. treat the spillover here */
/* subtract FMARIROWS from the spill values */
for (ctr=0; ctr < model.spillctr; ctr++)
model.spillover[ctr] -= FMARIROWS;
model.spillctr = vbwl_midsize ((int *) model.spillover, model.spillctr, \
(int *)(dest + ARIHDR + (*streamctr_w)), 4);
return (int)(ARIHDR + *streamctr_w + model.spillctr);
}
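
/*
 * Illustration only (not part of the original sources): a minimal calling
 * sketch for fmari_compress(). Buffer sizes follow the rule from the doc
 * comment above (the swap buffer needs nwords + 257 + 258 words when the
 * spill count cannot be guessed); the destination is sized like the input,
 * as in the demo below. The helper name is made up for this sketch.
 */
static int __attribute__((unused)) compress_example(unsigned int *samples, unsigned int nwords)
{
	unsigned int *dest;
	unsigned int *swap;
	int csize_words = -1;

	dest = kmalloc(nwords * sizeof(unsigned int));
	swap = kmalloc((nwords + 257 + 258) * sizeof(unsigned int));

	if (dest && swap)
		csize_words = fmari_compress(samples, nwords, dest, swap, PHOT_STANDARD);

	kfree(swap);
	kfree(dest);

	return csize_words;	/* compressed size in 32-bit words, or -1 on allocation failure */
}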
int op_output(unsigned long op_code, struct proc_task *t)
{
ssize_t n;
struct ProcData *p = NULL;
n = pt_get_nmemb(t);
if (!n)
goto exit;
p = (struct ProcData *) pt_get_data(t);
if (!p)
goto exit;
if (smp_cpu_id() == 0)
{
printk("took %lld ms\n", ktime_to_ms(ktime_delta(ktime_get(), p->start)));
printk("compressed size: %d down from %d, factor %g\n",
p->compressed.llcsize,
p->source.nelements * sizeof(unsigned int),
(double) p->source.nelements * sizeof(unsigned int) /
p->compressed.llcsize);
}
exit:
/* clean up our data buffers */
if (p) {
kfree(p->source.data);
kfree(p->swap.data);
kfree(p->compressed.data);
kfree(p);
}
pt_destroy(t);
return PN_TASK_SUCCESS;
}
int op_preproc_nlc(unsigned long op_code, struct proc_task *t)
{
ssize_t n;
struct ProcData *p;
n = pt_get_nmemb(t);
if (!n)
return PN_TASK_SUCCESS;
p = (struct ProcData *) pt_get_data(t);
if (!p) /* we have elements but data is NULL, error*/
return PN_TASK_DESTROY;
NlcSplineCorr28 ((unsigned int *)(p->source.data), p->source.nelements);
return PN_TASK_SUCCESS;
}
int op_decorr_diff(unsigned long op_code, struct proc_task *t)
{
ssize_t n;
struct ProcData *p;
n = pt_get_nmemb(t);
if (!n)
return PN_TASK_SUCCESS;
p = (struct ProcData *) pt_get_data(t);
if (!p) /* we have elements but data is NULL, error*/
return PN_TASK_DESTROY;
Delta32( (int *) p->source.data, p->source.nelements);
Map2Pos32((int *) p->source.data, p->source.nelements);
return PN_TASK_SUCCESS;
}
int op_lossy3_round2(unsigned long op_code, struct proc_task *t)
{
ssize_t n;
struct ProcData *p;
n = pt_get_nmemb(t);
if (!n)
return PN_TASK_SUCCESS;
p = (struct ProcData *) pt_get_data(t);
if (!p) /* we have elements but data is NULL, error*/
return PN_TASK_DESTROY;
BitRounding32u((unsigned int *)(p->source.data), 2, p->source.nelements);
return PN_TASK_SUCCESS;
}
int op_llc_ari1(unsigned long op_code, struct proc_task *t)
{
ssize_t n;
struct ProcData *p;
n = pt_get_nmemb(t);
if (!n)
return PN_TASK_SUCCESS;
p = (struct ProcData *) pt_get_data(t);
if (!p) /* we have elements but data is NULL, error*/
return PN_TASK_DESTROY;
p->compressed.llcsize = fmari_compress((unsigned int *)(p->source.data),
p->source.nelements,
(unsigned int *)p->compressed.data,
(unsigned int *)p->swap.data, 0);
p->compressed.llcsize *= 4; /* ARI counts words, we want bytes */
return PN_TASK_SUCCESS;
}
int pn_prepare_nodes(struct proc_net *pn)
{
struct proc_tracker *pt;
/* create and add processing node trackers for the each operation */
pt = pt_track_create(op_preproc_nlc, OP_PREPROC_NLC, CRIT_LEVEL);
BUG_ON(!pt);
BUG_ON(pn_add_node(pn, pt));
pt = pt_track_create(op_decorr_diff, OP_DECORR_DIFF, CRIT_LEVEL);
BUG_ON(!pt);
BUG_ON(pn_add_node(pn, pt));
pt = pt_track_create(op_lossy3_round2, OP_LOSSY3_ROUND2, CRIT_LEVEL);
BUG_ON(!pt);
BUG_ON(pn_add_node(pn, pt));
pt = pt_track_create(op_llc_ari1, OP_LLC_ARI1, CRIT_LEVEL);
BUG_ON(!pt);
BUG_ON(pn_add_node(pn, pt));
BUG_ON(pn_create_output_node(pn, op_output));
return 0;
}
void pn_new_input_task(struct proc_net *pn)
{
struct proc_task *t;
struct ProcData *p;
static int seq;
int i;
int n_elem;
p = kmalloc(sizeof(struct ProcData));
BUG_ON(!p);
t = pt_create(p, sizeof(struct ProcData), 10, 0, seq++);
BUG_ON(!t);
pt_set_nmemb(t, 1); /* 1 element */
BUG_ON(pt_add_step(t, OP_PREPROC_NLC, NULL));
BUG_ON(pt_add_step(t, OP_LOSSY3_ROUND2, NULL));
BUG_ON(pt_add_step(t, OP_DECORR_DIFF, NULL));
BUG_ON(pt_add_step(t, OP_LLC_ARI1, NULL));
/* allocate buffers */
n_elem = SRC_BUF_ELEM;
p->source.data = kmalloc(n_elem * sizeof(unsigned int));
p->source.nelements = n_elem;
BUG_ON(!p->source.data);
n_elem = SRC_BUF_ELEM + 256 + 257; /* for ARI */
p->swap.data = kmalloc(n_elem * sizeof(unsigned int));
p->swap.nelements = n_elem;
BUG_ON(!p->swap.data);
n_elem = COMPR_BUF_ELEM;
p->compressed.data = kmalloc(COMPR_BUF_ELEM * sizeof(unsigned int));
p->compressed.nelements = n_elem;
BUG_ON(!p->compressed.data);
for (i = 0; i < SRC_BUF_ELEM; i++)
((unsigned int *) p->source.data)[i] = i;
p->start = ktime_get();
pn_input_task(pn, t);
}
int demo(void *data)
{
int i;
int *go;
struct proc_net *pn;
pn = pn_create();
BUG_ON(!pn);
pn_prepare_nodes(pn);
go = (int *) data;
(*go) = 1; /* signal ready */
/* wait for trigger */
while (ioread32be(go) != CONFIG_SMP_CPUS_MAX);
/* execute the test 25 times */
for (i = 0; i < 25; i++) {
pn_new_input_task(pn);
pn_process_inputs(pn);
while (pn_process_next(pn));
pn_process_outputs(pn);
}
(*go)--;
return 0;
}
void demo_start(void)
{
int i;
int go;
struct task_struct *t;
printk("PROC NET DEMO STARTING\n");
printk("Creating tasks, please stand by\n");
for (i = 0; i < CONFIG_SMP_CPUS_MAX; i++) {
fm_ari_high[i] = 0xffff; /* init ARI */
go = 0;
t = kthread_create(demo, &go, i, "DEMO");
if (!IS_ERR(t)) {
/* allocate 95% of the cpu, period = 100 ms */
kthread_set_sched_edf(t, 100 * 1000, 98 * 1000, 95 * 1000);
if (kthread_wake_up(t) < 0) {
printk("---- %s NOT SCHEDUL-ABLE---\n", t->name);
BUG();
}
while (!ioread32be(&go)); /* wait for task to become ready */
} else {
printk("Got an error in kthread_create!");
break;
}
printk("Task ready on cpu %d\n", i);
}
printk("Triggering...\n");
go = CONFIG_SMP_CPUS_MAX; /* set trigger */
sched_yield();
while (ioread32be(&go)); /* wait for completion */
printk("PROC NET DEMO DONE\n");
}
@@ -49,6 +49,11 @@
 /** XXX dummy **/
 extern int cpu_ready[CONFIG_SMP_CPUS_MAX];
+
+void demo_start(void);
+int copybench_start(void);
+int oneshotedf_start(void);
+
 /**
  * @brief kernel initialisation routines
  */
@@ -76,6 +81,8 @@ int kernel_main(void)
 	struct elf_module m __attribute__((unused));
+#ifdef CONFIG_EMBED_MODULES_IMAGE
 	printk(MSG "Loading module image\n");
 	/* load the embedded AR image */
@@ -106,6 +113,7 @@ int kernel_main(void)
 	modules_list_loaded();
 #endif
+#endif
 #ifdef CONFIG_MPPB
@@ -139,7 +147,7 @@ int kernel_main(void)
 	}
-	printk(MSG "Boot complete\n");
+	// printk(MSG "Boot complete\n");
 #ifdef CONFIG_EMBED_APPLICATION
 	/* dummy demonstrator */
@@ -155,6 +163,13 @@ int kernel_main(void)
 #endif
 #endif
+#if 0
+	copybench_start();
+#else
+	//demo_start();
+	oneshotedf_start();
+#endif
+
 	while(1)
 		cpu_relax();
@@ -51,14 +51,14 @@ static void kthread_unlock(void)
 void kthread_set_sched_edf(struct task_struct *task, unsigned long period_us,
-			   unsigned long wcet_us, unsigned long deadline_rel_us)
+			   unsigned long deadline_rel_us, unsigned long wcet_us)
 {
 	struct sched_attr attr;
 	sched_get_attr(task, &attr);
 	attr.policy = SCHED_EDF;
 	attr.period = us_to_ktime(period_us);
-	attr.deadline_rel = us_to_ktime(wcet_us);
-	attr.wcet = us_to_ktime(deadline_rel_us);
+	attr.deadline_rel = us_to_ktime(deadline_rel_us);
+	attr.wcet = us_to_ktime(wcet_us);
 	sched_set_attr(task, &attr);
 }
@@ -70,6 +70,7 @@ void kthread_set_sched_edf(struct task_struct *task, unsigned long period_us,
 void kthread_free(struct task_struct *task)
 {
+	return;
 	if (task->flags & TASK_NO_CLEAN) /* delete from list as well */
 		return;
@@ -107,7 +108,9 @@ int kthread_wake_up(struct task_struct *task)
 	kthread_lock();
 	now = ktime_get();
-	sched_wake(task, ktime_get());
+	sched_wake(task, now);
+
+	task->wakeup_first = now;
 	/* this may be a critical task, send reschedule */
 	if (task->on_cpu != KTHREAD_CPU_AFFINITY_NONE)
@@ -213,6 +216,7 @@ static struct task_struct *kthread_create_internal(int (*thread_fn)(void *data),
 		return NULL;
 	}
+	task->create = ktime_get();
 	task->total = 0;
 	task->slices = 0;
 	task->on_cpu = cpu;
@@ -41,6 +41,7 @@ static void sched_update_runtime(struct task_struct *task, ktime now)
 	rt = ktime_sub(now, task->exec_start);
+	task->exec_stop = now;
 	task->runtime = ktime_sub(task->runtime, rt);
 	task->total = ktime_add(task->total, rt);
@@ -348,7 +349,7 @@ int sched_get_attr(struct task_struct *task, struct sched_attr *attr)
 int sched_set_policy_default(struct task_struct *task)
 {
 	struct sched_attr attr = {.policy = SCHED_RR,
-				  .priority = 1};
+				  .priority = 100};
 	return sched_set_attr(task, &attr);