Skip to content
Snippets Groups Projects
Commit f54a35df authored by Armin Luntzer's avatar Armin Luntzer
Browse files

add rampfit demo kernel and adapt xentium demo code

parent eacdcbad
Branches
No related tags found
No related merge requests found
...@@ -33,7 +33,11 @@ HOSTLDFLAGS += -Tdsp/xentium/sysroot/lib/default.ld ...@@ -33,7 +33,11 @@ HOSTLDFLAGS += -Tdsp/xentium/sysroot/lib/default.ld
HOSTLDFLAGS += --sysroot=dsp/xentium/sysroot HOSTLDFLAGS += --sysroot=dsp/xentium/sysroot
xen_libs: xen_libs:
xen_libs-objs := xen_printf.o data_proc_task.o lib/xen.o lib/dma.o xen_libs-objs := lib/xen_printf.o
xen_libs-objs += lib/xen.o
xen_libs-objs += lib/dma.o
xen_libs-objs += lib/kmem.o
xen_libs-objs += ../../lib/data_proc_task.o
xen_dummy.xen : xen_libs xen_dummy.xen : xen_libs
HOSTLOADLIBES_xen_dummy.xen := HOSTLOADLIBES_xen_dummy.xen :=
...@@ -47,6 +51,14 @@ otherkernel.xen-objs := otherkernel.o $(xen_libs-objs) ...@@ -47,6 +51,14 @@ otherkernel.xen-objs := otherkernel.o $(xen_libs-objs)
hostprogs-y += otherkernel.xen hostprogs-y += otherkernel.xen
# XXX using the .S file works, but make throws a message; needs to address that at some point
xen_rampfit.xen : otherkernel.xen xen_libs
HOSTLOADLIBES_xen_rampfit.xen := -Ttext $$(readelf -s dsp/xentium/otherkernel.xen|grep -w _end |awk '{print $$2}')
xen_rampfit.xen-objs := kernel/rampfit/xen_rampfit.o $(xen_libs-objs) kernel/rampfit/xen_asm_rampfit.S
hostprogs-y += xen_rampfit.xen
always := $(hostprogs-y) always := $(hostprogs-y)
; description: rampfit assembler function for the NGAPP project
; authors: A. Luntzer
; version: 0.2
; date: 24.10.2013
; history: -
; note: this is the .75 cycles/sample variant
;
;
; equivalent C implementation (author: R. Ottensamer)
;int FastIntFixedRampFitBufferC (volatile long *data,
; unsigned int n_samples,
; unsigned int ramplen,
; long *slopes)
;
;{
; int i = 0;
; int r = 0;
; int ampl = ramplen;
; int SyTerm = 0;
;
; int pos = 0; /* temporary offset storage */
; int value = 0; /* temporary sample storage */
;
; int Sy = 0;
; int Sxy = 0;
;
; for (pos = 0; pos < (n_samples-ramplen+1); )
; {
; Sy = 0;
; Sxy = 0;
;
; for (i=1; i <= ramplen; i++) /* equation starts with 1 */
; {
; value = data[pos++];
; Sy += value;
; Sxy += i * value;
; printk ("Sxy: %d value %d\n", Sxy, value);
; }
;
; SyTerm = ampl*((ramplen+1) * Sy) >> 1;
; slopes[r++] = (ampl*Sxy - SyTerm);
; /* denomination has to be done outside */
; }
;
; return r;
;}
;; fast ramp fit
;; int FastIntFixedRampFitBuffer (int *piBANK0, int *piBANK2, unsigned int number_of_samples, unsigned int ramp_length, int *iSlopes);
;;
;; For every ramp of ramp_length samples one un-normalised slope is stored:
;;   slope = ramp_length * Sxy - ((ramp_length * (ramp_length + 1) * Sy) >> 1)
;; with Sy  = sum of the samples of the ramp and
;;      Sxy = sum of (1-based sample index * sample),
;; cf. the equivalent C implementation in the file header.
;;
;; The samples are read pairwise from two buffers in the same cycle:
;; words 0,1 of buffer 1 are multiplied with indices 1 and 3, words 0,1 of
;; buffer 2 with indices 2 and 4, i.e. the input must be de-interleaved
;; (odd samples in buffer 1, even samples in buffer 2).
;; The inner loop runs ramp_length/4 times and consumes 4 samples per pass,
;; so ramp_length is expected to be a multiple of 4.
;; The end-of-buffer condition is only evaluated after a ramp has been
;; processed, so at least one slope is always written.
;;
;; returns the number of ramps processed (r, held in the return register)
.globl FastIntFixedRampFitBuffer
.align 4
.type FastIntFixedRampFitBuffer,@function
;; function call arguments
%define RFdatabuf1 RA6
%define RFdatabuf2 RB6
%define num_samples RC6
%define ramp_len RD6
%define RFslopebuf RE6
;; reserved registers
;; NOTE: RFretval, r (below) and RFdatabuf1 all map to RA6
%define RFretptr RA7
%define RFretval RA6
%define cond0 RC0
%define cond1 RB0
;; constants and local variables
%define RFdata1 RA5
%define RFdata2 RC2
%define rl1 RC5
%define Sy RE2
%define Sxy RD5
%define r RA6
%define RFiloop RE1
%define tmp RA4
%define RFoloop RD4
%define i1 RA2
%define i2 RB2
%define i3 RC3
%define i4 RD3
%define RFamp RE3
FastIntFixedRampFitBuffer:
;; setup: copy the buffer pointers to working registers, compute
;; ramp_len + 1, the inner loop count and the end of buffer 1
;; block 1
A0 ADD ramp_len, 1 ; calculate ramplen +1 = rl1
A1 OR 0, RFdatabuf1 ; RFdata1
S1 OR 0, RFdatabuf2 ; RFdata2
C0 LINK
S0 SL num_samples, 1 ; RFns * 4 / 2 = offset to end of buffer in words, split into two banks
cond0 = 0
r = 0
;; block 2
S0 ADD A1X, S0X ; end of buffer
S1 SRU ramp_len, 2 ; RFiloop = ramp_len/4
rl1 = A0X
RFdata1 = A1X
RFdata2 = S1X
;; result of the LINK in block 1; used as target of the final BRA
RFretptr = C0X
;; block 3
RFiloop = S1X
tmp = S0X
;; outer loop: one pass per ramp
.RFloop:
;; block 4
C0 LOOP 3, 8, RFiloop
M0 MUL ramp_len, rl1 ; RFamp - compute every time, saves 1 cycle during lead in plus unit is free anyway
A0 SUB tmp, 8
Sy = 0
;; block 5; loop delay slot 1
A0 SUB 0, 3 ; i1 prepare indices with negative offset
A1 SUB 0, 2 ; i2 before entering the loop, they are
C0 SUB 0, 1 ; i3 incremented to 1,2,3,4 on the first pass
;; A0X here still holds the result of block 4 (tmp - 8)
RFoloop = A0X
;; block 6; loop delay slot 2
S1 OR 0, 0 ; prep initial Sxy (need that to properly fold the loop)
RFamp = M0X
i1 = A0X
i2 = A1X
i3 = C0X
i4 = 0
;; block 7; loop body block 1
A0 ADD RFdata1, 8 ; forward 2 samples (words)
A1 ADD i1, 4 ; increment loop indices
C0 ADD i2, 4
E0 LD2 RFdata1 ; load from current
E1 LD2 RFdata2
;; block 8; loop body block 2
A0 ADD RFdata2, 8 ; forward 2 samples
A1 ADD i3, 4 ; increment loop indices
C0 ADD i4, 4
i1 = A1X ; update loop indices
i2 = C0X
RFdata1 = A0X ; updated buffer1 pointer
;; block 9; loop body block 3
A0 ADD E0X, E0Y ; sum first pair
A1 ADD E1X, E1Y ; and second pair
M0 MUL E0X, i1 ; multiply first two samples with
M1 MUL E1X, i2 ; loop index
C0 CMPGT RFdata1, RFoloop ; check if we reached end of buffer
i3 = A1X ; update loop indices
i4 = C0X
RFdata2 = A0X ; updated buffer2 pointer
;; block 10; loop body block 4
P0 ADD A0X, A1X ; final sum of all samples
M0 MUL E0Y, i3 ; multiply 3rd and 4th sample with loop
M1 MUL E1Y, i4 ; index
cond0 = C0X ; update result of condition
;; block 11; loop body block 5
S0 ADD Sy, P0X ; add sum of 4 samples to Sy
P0 ADD M0X, M1X ; add result of first two samples * loop idx
;; block 12; loop body block 6
S0 ADD M0X, M1X ; add result of 3rd and 4th sample * loop idx
Sy = S0X ; assign new Sy
;; block 13; loop body block 7
S0 ADD P0X, S0X ; final sum of idx * sample multiplications
Sxy = S1X ; update Sxy from previous cycle
;; block 14; loop body block 8
S1 ADD Sxy, S0X ; add new idx * sample to Sxy
;; block 15 - back in outer loop (the inner loop ends automatically)
M0 MUL Sy, RFamp ; multiply Sy with amplifier
M1 MUL S1X, ramp_len ; use latest Sxy and multiply with ramplen (iAmplify)
;; block 16 - wait
A1 ADD r, 1 ; increment number of ramps processed
;; block 17
C0 BRZ, cond0 .RFloop ; init branch to start of outer loop
S0 SRU M0X, 1 ; divide iSyTerm
;; block 18
A0 SUB M1X, S0X ; calculate slope
;; block 19 - BR delay slot 1
E0 STW RFslopebuf[r], A0X ; A0X from block 18
r = A1X ; update r (in loop) ; RFretval == r!
;; block 20 - BR delay slot 2
C0 BRA RFretptr ; init branch back to caller
;; block 21+22
NOP 2 ; final delay slots
;; Elvis has left the building
.E_FastIntFixedRampFitBuffer:
.size FastIntFixedRampFitBuffer, .E_FastIntFixedRampFitBuffer-FastIntFixedRampFitBuffer
/**
* NOTE: This is for demonstration purposes only. There are a lot of things
* that are not verified/handled/you name it, but the purpose of this kernel
* is to show off what can be done, and how, with the resources available.
* This includes:
* - command exchange with the host processor
* - access to processing tasks
* - complex DMA transfer
* - kmalloc/kfree via the host processor
* - integration of custom (really seriously very superfast) assembly for
* the actual processing
*
* NOTE: the number of samples per ramp must be a multiple of 4
*/
#include <xen.h>
#include <dma.h>
#include <xen_printf.h>
#include <data_proc_net.h>
#include <kernel/kmem.h>
/* this kernel's properties */
/* name of this kernel */
#define KERN_NAME "rampfit"
/* NOTE(review): presumably the size of extra storage requested by the kernel;
 * none is requested here -- confirm against struct xen_kernel_cfg
 */
#define KERN_STORAGE_BYTES 0
/* op code of the processing step this kernel handles; the demo in
 * init/xentium_demo.c adds a step with the same value
 */
#define KERN_OP_CODE 0x00bada55
/* NOTE(review): presumably the critical level of pending tasks for this
 * kernel -- confirm against struct xen_kernel_cfg
 */
#define KERN_CRIT_TASK_LVL 25
/* Kernel configuration, initialised positionally: the order of the values
 * must match the member order of struct xen_kernel_cfg (declared elsewhere).
 */
struct xen_kernel_cfg _xen_kernel_param = {
KERN_NAME, KERN_OP_CODE,
KERN_CRIT_TASK_LVL,
NULL, KERN_STORAGE_BYTES,
};
/**
 * Operation info attached to a processing step by the creator of the task,
 * see init/xentium_demo.c. Both sides must use the same structure layout.
 */
struct myopinfo {
unsigned int ramplen;	/* number of samples per ramp */
};
/* prototype of our assembly function
 *
 * bank1/bank2: the two buffers holding the de-interleaved samples
 * n_samples:   total number of samples in both buffers
 * ramplen:     number of samples per ramp
 * slopes:      output buffer, one entry is written per processed ramp
 *
 * returns the number of ramps processed
 *
 * NOTE(review): the assembly source documents the pointers as int *, here
 * they are long *; this assumes both are 32 bit wide on the Xentium -- confirm
 */
int FastIntFixedRampFitBuffer(long *bank1, long *bank2,
unsigned int n_samples, unsigned int ramplen,
long *slopes);
/**
* here we do the work
*/
static void process_task(struct xen_msg_data *m)
{
size_t n;
size_t n_ramps;
long *p;
long *slopes;
volatile long *b1;
volatile long *b2;
volatile long *b3;
struct xen_tcm *tcm_ext;
struct myopinfo *op_info;
if (!m->t) {
m->cmd = TASK_DESTROY;
return;
}
/* These refers to the TCM banks. Note that at least b1 must be
* volatile, or clang soils itself because the local TCM starts at 0x0
* and (in this case incorrectly) detects that as a NULL pointer
* dereference.
*/
b1 = (volatile long *) xen_tcm_local->bank1;
b2 = (volatile long *) xen_tcm_local->bank2;
b3 = (volatile long *) xen_tcm_local->bank3;
/* determine our TCM's external address, so we can program DMA
* transfers correctly
*/
tcm_ext = xen_get_base_addr(m->xen_id);
op_info = (struct myopinfo *) pt_get_pend_step_op_info(m->t);
if (!op_info) {
m->cmd = TASK_DESTROY;
return;
}
/* number of elements in data buffer. */
n = pt_get_nmemb(m->t);
if (n & 0x3) {
printk("Warning: N is not a multiple of 4, adjusting.\n");
n &= ~0x3;
}
/* The buffer to store the slopes may be anywhere in memory. We only
* write to it every couple of cycles, no problem.
* (Don't do this if you want fast accesses/reads!)
*/
n_ramps = n / op_info->ramplen;
slopes = kzalloc(n_ramps * sizeof(long));
/* the data buffer of this task */
p = (unsigned long *) pt_get_data(m->t);
/* retrieve data to TCM
* XXX no retval handling
* NOTE: we support at most 8k 32 bit samples for one processing round
* where the number of samples are (rounded down) to a multiple of four,
* i.e. one can fit at most 2048 ramps of 4 samples per "task"
* (remember this was made for demonstrational purposes only)
*
*
* How this is done: the ramp fit assembly implementation we use here
* expects us to deliver the data in two parallel TCM banks, which
* should be banks 1 and 3, which in principle allows us to work on
* ramps of up to 8192 samples total, which must be stored in two bank
* groups sequentially but "de-interleaved", so there are no same-bank
* access conflicts of the Xentiums E units and we may load 4 samples at
* in every clock cycle:
*
* +----------v
* ⎮ 1 2 3 4
* ⎮ +-----+-----+-----+-----+
* ⎮ ⎮ 1 ⎮ 9 ⎮ 2 ⎮ 10 ⎮
* ⎮ ⎮ 3 ⎮ ... ⎮ 4 ⎮ ... ⎮
* ⎮ ⎮ ... ⎮ ... ⎮ ... ⎮ ... ⎮
* ⎮ ⎮ 7 ⎮ 21 ⎮ 8 ⎮ 22 ⎮
* ⎮ +-----+-----+-----+-----+
* +____v
*
*
*
* To set this up, we'll perform a DMA transfer to split the linear
* sequences of ramp data into the banks.
*
* We instruct the DMA to consider the data in a shape that is 2 columns
* by n/2 rows. Iterating over the source, it will take items from
* columns sequentially and place the into the target with an offset of
* 2 bank sizes apart (b3 - b1). It will then forward-skip two columns
* to reach the next row, but perform no skip at the target, because
* there the column shape is 1 column by n/2 rows.
*/
xen_noc_dma_req_xfer(m->dma, p, tcm_ext, 2, n/2, WORD,
1, (int16_t)(b3 - b1),
2, 1, LOW, 256);
/* process the ramps*/
FastIntFixedRampFitBuffer((long *) b1,
(long *) b3,
n, op_info->ramplen, slopes);
/* Now copy the resulting slopes to the data buffer and free the
* temporary allocation.
* We do this by performing two transfers, one from the slopes buffer to
* the TCM, then back to the data buffer of the task.
* We have to do this, because the DMA cannot perform transfers into the
* same NoC node (if you do, it gets stuck)
*/
xen_noc_dma_req_lin_xfer(m->dma, slopes, tcm_ext, n_ramps,
WORD, LOW, 256);
/* now into the task_'s data buffer */
xen_noc_dma_req_lin_xfer(m->dma, tcm_ext, p, n_ramps, WORD, LOW, 256);
/* update the member size of the buffer */
pt_set_nmemb(m->t, n_ramps);
/* free the slope buffer */
kfree(slopes);
/* and we're done */
m->cmd = TASK_SUCCESS;
}
/**
 * the main function: wait for commands from the host, process the attached
 * task and report back, until an exit is requested
 */
int main(void)
{
	struct xen_msg_data *msg;


	for (;;) {

		msg = xen_wait_cmd();

		if (!msg) {
			printk("Invalid command location, bailing.");
			return 0;
		}

		if (msg->cmd == TASK_EXIT) {
			/* confirm abort */
			xen_send_msg(msg);
			return 0;
		}

		/* any other command: work on the task and reply */
		process_task(msg);
		xen_send_msg(msg);
	}

	return 0;
}
#include <data_proc_task.h>
/* the xentium must be able to interpret the proc task
* XXX this is just a hack, do it properly */
/**
 * @brief query how many elements the data buffer of a processing task holds
 *
 * @param t a struct proc_task
 *
 * @return the element count stored in the task
 */
size_t pt_get_nmemb(struct proc_task *t)
{
	size_t nmemb = t->nmemb;

	return nmemb;
}
/**
 * @brief retrieve the data buffer attached to a processing task
 *
 * @param t a struct proc_task
 *
 * @return the data buffer pointer stored in the task (may be NULL)
 */
void *pt_get_data(struct proc_task *t)
{
	void *buf = t->data;

	return buf;
}
...@@ -16,6 +16,20 @@ ...@@ -16,6 +16,20 @@
#include <kernel/xentium.h> #include <kernel/xentium.h>
/**
* Some implementation dependent op info passed by whatever created the task
* this could also just exist in the <data> buffer as an interpretable structure.
* This is really up to the user...
* Note: the xentium kernel processing a task must know the same structure
*/
struct myopinfo {
unsigned int ramplen;
};
/** /**
* @brief the output function of the xentium processing network * @brief the output function of the xentium processing network
*/ */
...@@ -29,6 +43,9 @@ static int xen_op_output(unsigned long op_code, struct proc_task *t) ...@@ -29,6 +43,9 @@ static int xen_op_output(unsigned long op_code, struct proc_task *t)
/* need to address those caching issues at some point */
asm("flush");
n = pt_get_nmemb(t); n = pt_get_nmemb(t);
...@@ -42,7 +59,8 @@ static int xen_op_output(unsigned long op_code, struct proc_task *t) ...@@ -42,7 +59,8 @@ static int xen_op_output(unsigned long op_code, struct proc_task *t)
if (!p) if (!p)
goto exit; goto exit;
printk("XEN_OUT: \t%d\n", ioread32be(&p[n-1])); for (i = 0; i < n; i++)
printk("XEN_OUT: \t%d\n", ioread32be(&p[i]));
exit: exit:
...@@ -73,7 +91,11 @@ static void xen_new_input_task(size_t n) ...@@ -73,7 +91,11 @@ static void xen_new_input_task(size_t n)
int i; int i;
unsigned int *data; unsigned int *data;
struct myopinfo *nfo;
nfo = kzalloc(sizeof(struct myopinfo));
if (!nfo)
return;
data = kzalloc(sizeof(unsigned int) * n); data = kzalloc(sizeof(unsigned int) * n);
if (!data) if (!data)
...@@ -86,11 +108,8 @@ static void xen_new_input_task(size_t n) ...@@ -86,11 +108,8 @@ static void xen_new_input_task(size_t n)
BUG_ON(!t); BUG_ON(!t);
BUG_ON(pt_add_step(t, 0xdeadbeef, NULL)); nfo->ramplen = 16;
BUG_ON(pt_add_step(t, 0xb19b00b5, NULL)); BUG_ON(pt_add_step(t, 0x00bada55, nfo));
BUG_ON(pt_add_step(t, 0xdeadbeef, NULL));
BUG_ON(pt_add_step(t, 0xb19b00b5, NULL));
BUG_ON(pt_add_step(t, 0xb19b00b5, NULL));
while (xentium_input_task(t) < 0) while (xentium_input_task(t) < 0)
printk("Xenitium input busy!\n"); printk("Xenitium input busy!\n");
...@@ -110,11 +129,8 @@ void xen_demo(void) ...@@ -110,11 +129,8 @@ void xen_demo(void)
xentium_config_output_node(xen_op_output); xentium_config_output_node(xen_op_output);
xen_new_input_task(32);
while (1) { while (1) {
static int seq = 100;
if (seq < 120)
xen_new_input_task(seq++);
xentium_output_tasks(); xentium_output_tasks();
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment