diff --git a/dsp/xentium/Makefile b/dsp/xentium/Makefile index e63613811b9b9b52f71daa86a8232af5aa2b36b1..d37b8001f38005ae326464e817c26723e3397f5b 100644 --- a/dsp/xentium/Makefile +++ b/dsp/xentium/Makefile @@ -33,7 +33,11 @@ HOSTLDFLAGS += -Tdsp/xentium/sysroot/lib/default.ld HOSTLDFLAGS += --sysroot=dsp/xentium/sysroot xen_libs: -xen_libs-objs := xen_printf.o data_proc_task.o lib/xen.o lib/dma.o +xen_libs-objs := lib/xen_printf.o +xen_libs-objs += lib/xen.o +xen_libs-objs += lib/dma.o +xen_libs-objs += lib/kmem.o +xen_libs-objs += ../../lib/data_proc_task.o xen_dummy.xen : xen_libs HOSTLOADLIBES_xen_dummy.xen := @@ -41,12 +45,20 @@ xen_dummy.xen-objs := xen_dummy.o $(xen_libs-objs) hostprogs-y := xen_dummy.xen -otherkernel.xen : xen_dummy.xen xen_libs +otherkernel.xen : xen_dummy.xen xen_libs HOSTLOADLIBES_otherkernel.xen := -Ttext $$(readelf -s dsp/xentium/xen_dummy.xen|grep -w _end |awk '{print $$2}') otherkernel.xen-objs := otherkernel.o $(xen_libs-objs) hostprogs-y += otherkernel.xen +# XXX using the .S file works, but make throws a message; needs to address that at some point +xen_rampfit.xen : otherkernel.xen xen_libs +HOSTLOADLIBES_xen_rampfit.xen := -Ttext $$(readelf -s dsp/xentium/otherkernel.xen|grep -w _end |awk '{print $$2}') +xen_rampfit.xen-objs := kernel/rampfit/xen_rampfit.o $(xen_libs-objs) kernel/rampfit/xen_asm_rampfit.S +hostprogs-y += xen_rampfit.xen + + + always := $(hostprogs-y) diff --git a/dsp/xentium/kernel/rampfit/xen_asm_rampfit.S b/dsp/xentium/kernel/rampfit/xen_asm_rampfit.S new file mode 100644 index 0000000000000000000000000000000000000000..d14e0f6282a1616c8b2ef9947afc78c259d32baa --- /dev/null +++ b/dsp/xentium/kernel/rampfit/xen_asm_rampfit.S @@ -0,0 +1,204 @@ +; description: rampfit assembler function for the NGAPP project +; authors: A. Luntzer +; version: 0.2 +; date: 24.10.2013 +; history: - +; note: this is the .75 cycles/sample variant +; +; +; equivalent C implementation (author: R. 
Ottensamer) +;int FastIntFixedRampFitBufferC (volatile long *data, +; unsigned int n_samples, +; unsigned int ramplen, +; long *slopes) +; +;{ +; int i = 0; +; int r = 0; +; int ampl = ramplen; +; int SyTerm = 0; +; +; int pos = 0; /* temporary offset storage */ +; int value = 0; /* temporary sample storage */ +; +; int Sy = 0; +; int Sxy = 0; +; +; for (pos = 0; pos < (n_samples-ramplen+1); ) +; { +; Sy = 0; +; Sxy = 0; +; +; for (i=1; i <= ramplen; i++) /* equation starts with 1 */ +; { +; value = data[pos++]; +; Sy += value; +; Sxy += i * value; +; printk ("Sxy: %d value %d\n", Sxy, value); +; } +; +; SyTerm = ampl*((ramplen+1) * Sy) >> 1; +; slopes[r++] = (ampl*Sxy - SyTerm); +; /* denomination has to be done outside */ +; } +; +; return r; +;} + +;; fast ramp fit +;; int FastIntFixedRampFitBuffer (int *piBANK0, int *piBANK2, unsigned int number_of_samples, unsigned int ramp_length, int *iSlopes); +.globl FastIntFixedRampFitBuffer +.align 4 +.type FastIntFixedRampFitBuffer,@function + +;; function call arguments +%define RFdatabuf1 RA6 +%define RFdatabuf2 RB6 +%define num_samples RC6 +%define ramp_len RD6 +%define RFslopebuf RE6 +;; reserved registers +%define RFretptr RA7 +%define RFretval RA6 +%define cond0 RC0 +%define cond1 RB0 + +;; constants and local variables +%define RFdata1 RA5 +%define RFdata2 RC2 +%define rl1 RC5 +%define Sy RE2 +%define Sxy RD5 +%define r RA6 +%define RFiloop RE1 +%define tmp RA4 + +%define RFoloop RD4 + +%define i1 RA2 +%define i2 RB2 + +%define i3 RC3 +%define i4 RD3 +%define RFamp RE3 + +FastIntFixedRampFitBuffer: + ;; block 1 + A0 ADD ramp_len, 1 ; calculate ramplen +1 = rl1 + A1 OR 0, RFdatabuf1 ; RFdata1 + S1 OR 0, RFdatabuf2 ; RFdata2 + C0 LINK + S0 SL num_samples, 1 ; RFns * 4 / 2 = offset to end of buffer in words, split into two banks + cond0 = 0 + r = 0 + + ;; block 2 + S0 ADD A1X, S0X ; end of buffer + S1 SRU ramp_len, 2 ; RFiloop = ramp_len/4 + rl1 = A0X + RFdata1 = A1X + RFdata2 = S1X + RFretptr = C0X + + ;; block 3 + 
RFiloop = S1X + tmp = S0X + +.RFloop: + ;; block 4 + C0 LOOP 3, 8, RFiloop + M0 MUL ramp_len, rl1 ; RFamp - compute every time, saves 1 cycle during lead in plus unit is free anyway + A0 SUB tmp, 8 + Sy = 0 + + ;; block 5; loop delay slot 1 + A0 SUB 0, 3 ; i1 prepare indices with negative offset + A1 SUB 0, 2 ; i2 before entering the loop, they are + C0 SUB 0, 1 ; i3 incremented to 1,2,3,4 on the first pass + RFoloop = A0X + + + ;; block 6; loop delay slot 2 + S1 OR 0, 0 ; prep initial Sxy (need that to properly fold the loop) + RFamp = M0X + i1 = A0X + i2 = A1X + i3 = C0X + i4 = 0 + + ;; block 7; loop body block 1 + A0 ADD RFdata1, 8 ; forwared 2 samples (words) + A1 ADD i1, 4 ; increment loop indices + C0 ADD i2, 4 + E0 LD2 RFdata1 ; load from current + E1 LD2 RFdata2 + + ;; block 8; loop body block 2 + A0 ADD RFdata2, 8 ; forward 2 samples + A1 ADD i3, 4 ; increment loop indices + C0 ADD i4, 4 + i1 = A1X ; update loop indices + i2 = C0X + RFdata1 = A0X ; updated buffer1 pointer + + ;; block 9; loop body block 3 + A0 ADD E0X, E0Y ; sum first pair + A1 ADD E1X, E1Y ; and second pair + M0 MUL E0X, i1 ; multiply first two samples with + M1 MUL E1X, i2 ; loop index + C0 CMPGT RFdata1, RFoloop ; check if we reached end of buffer + i3 = A1X ; update loop indices + i4 = C0X + RFdata2 = A0X ; updated buffer2 pointer + + ;; block 10; loop body block 4 + P0 ADD A0X, A1X ; final sum of all samples + M0 MUL E0Y, i3 ; multiply 3rd and 4th sample with loop + M1 MUL E1Y, i4 ; index + cond0 = C0X ; update result of condition + + ;; block 11; loop body block 5 + S0 ADD Sy, P0X ; add sum of 4 samples to Sy + P0 ADD M0X, M1X ; add result of first two samples * loop idx + + ;; block 12; loop body block 6 + S0 ADD M0X, M1X ; add result of 3rd and 4th sample * loop idx + Sy = S0X ; assign new Sy + + ;; block 13; loop body block 7 + S0 ADD P0X, S0X ; final sum of idx * sample multiplications + Sxy = S1X ; update Sxy from previous cycle + + ;; block 14; loop body block 8 + S1 ADD Sxy, 
S0X ; add new idx * sample to Sxy
+
+	;; block 15 - back in outer loop (the inner loop ends automatically)
+	M0 MUL Sy, RFamp ; multiply Sy with amplifier
+	M1 MUL S1X, ramp_len ; use latest Sxy and multiply with ramplen (iAmplify)
+
+	;; block 16 - wait
+	A1 ADD r, 1 ; increment number of ramps processed
+
+	;; block 17
+	C0 BRZ, cond0 .RFloop ; init branch to start of outer loop
+	S0 SRU M0X, 1 ; divide iSyTerm
+
+	;; block 18
+	A0 SUB M1X, S0X ; calculate slope
+
+	;; block 19 - BR delay slot 1
+	E0 STW RFslopebuf[r], A0X ; A0X from block 18
+	r = A1X ; update r (in loop) ; RFretval == r!
+
+	;; block 20 - BR delay slot 2
+	C0 BRA RFretptr ; init branch back to caller
+
+	;; block 21+22
+	NOP 2 ; final delay slots
+
+	;; Elvis has left the building
+
+
+
+.E_FastIntFixedRampFitBuffer:
+.size FastIntFixedRampFitBuffer, .E_FastIntFixedRampFitBuffer-FastIntFixedRampFitBuffer
diff --git a/dsp/xentium/kernel/rampfit/xen_rampfit.c b/dsp/xentium/kernel/rampfit/xen_rampfit.c
new file mode 100644
index 0000000000000000000000000000000000000000..5e922c22a8bf81af8d79722fb30143f9bb102879
--- /dev/null
+++ b/dsp/xentium/kernel/rampfit/xen_rampfit.c
@@ -0,0 +1,265 @@
+/**
+ * NOTE: This is for demonstration purposes only. There are a lot of things
+ *       that are not verified/handled/you name it, but the purpose of this
+ *       kernel is to show off what can be done and how with the resources
+ *       available.
+ *       This includes:
+ *       - command exchange with the host processor
+ *       - access to processing tasks
+ *       - complex DMA transfer
+ *       - kmalloc/kfree via the host processor
+ *       - integration of custom (really seriously very superfast) assembly
+ *         for the actual processing
+ *
+ * NOTE: the number of samples per ramp must be a multiple of 4
+ */
+
+
+#include <xen.h>
+#include <dma.h>
+#include <xen_printf.h>
+#include <data_proc_net.h>
+#include <kernel/kmem.h>
+
+
+
+
+/* this kernel's properties */
+
+#define KERN_NAME		"rampfit"
+#define KERN_STORAGE_BYTES	0
+#define KERN_OP_CODE		0x00bada55
+#define KERN_CRIT_TASK_LVL	25
+
+struct xen_kernel_cfg _xen_kernel_param = {
+	KERN_NAME, KERN_OP_CODE,
+	KERN_CRIT_TASK_LVL,
+	NULL, KERN_STORAGE_BYTES,
+};
+
+
+/**
+ * see init/xentium_demo.c
+ */
+
+struct myopinfo {
+	unsigned int ramplen;
+};
+
+
+/* prototype of our assembly function */
+int FastIntFixedRampFitBuffer(long *bank1, long *bank2,
+			      unsigned int n_samples, unsigned int ramplen,
+			      long *slopes);
+
+/**
+ * @brief fit the ramps in a task's data buffer and replace the samples with
+ *	  the computed slopes
+ *
+ * @param m the command message; m->cmd is set to TASK_SUCCESS once the slopes
+ *	    have been written to the task's data buffer, or to TASK_DESTROY if
+ *	    the task cannot be processed
+ */
+
+static void process_task(struct xen_msg_data *m)
+{
+	size_t n;
+	size_t n_ramps;
+
+	long *p;
+	long *slopes;
+
+	volatile long *b1;
+	volatile long *b3;
+
+	struct xen_tcm *tcm_ext;
+
+	struct myopinfo *op_info;
+
+
+	if (!m->t) {
+		m->cmd = TASK_DESTROY;
+		return;
+	}
+
+	/* These refer to the TCM banks. Note that at least b1 must be
+	 * volatile, or clang soils itself because the local TCM starts at 0x0
+	 * and (in this case incorrectly) detects that as a NULL pointer
+	 * dereference.
+	 */
+
+	b1 = (volatile long *) xen_tcm_local->bank1;
+	b3 = (volatile long *) xen_tcm_local->bank3;
+
+
+	/* determine our TCM's external address, so we can program DMA
+	 * transfers correctly
+	 */
+	tcm_ext = xen_get_base_addr(m->xen_id);
+
+	op_info = (struct myopinfo *) pt_get_pend_step_op_info(m->t);
+	if (!op_info) {
+		m->cmd = TASK_DESTROY;
+		return;
+	}
+
+	/* The ramp length is a divisor below and the assembly routine needs
+	 * it to be a multiple of 4 (see the NOTE at the top of this file).
+	 */
+	if (!op_info->ramplen || (op_info->ramplen & 0x3)) {
+		printk("Invalid ramp length, must be a multiple of 4.\n");
+		m->cmd = TASK_DESTROY;
+		return;
+	}
+
+	/* the data buffer of this task (may be NULL) */
+	p = (long *) pt_get_data(m->t);
+	if (!p) {
+		m->cmd = TASK_DESTROY;
+		return;
+	}
+
+	/* number of elements in data buffer. */
+	n = pt_get_nmemb(m->t);
+
+	if (n & 0x3) {
+		printk("Warning: N is not a multiple of 4, adjusting.\n");
+		n &= ~0x3;
+	}
+
+	/* Only whole ramps can be fitted. Without at least one of them there
+	 * is nothing to do, and the slopes buffer below would be empty.
+	 */
+	n_ramps = n / op_info->ramplen;
+	if (!n_ramps) {
+		m->cmd = TASK_DESTROY;
+		return;
+	}
+
+	/* do not hand a trailing partial ramp to the assembly routine */
+	n = n_ramps * op_info->ramplen;
+
+
+	/* The buffer to store the slopes may be anywhere in memory. We only
+	 * write to it every couple of cycles, no problem.
+	 * (Don't do this if you want fast accesses/reads!)
+	 */
+	slopes = kzalloc(n_ramps * sizeof(*slopes));
+	if (!slopes) {
+		m->cmd = TASK_DESTROY;
+		return;
+	}
+
+
+	/* retrieve data to TCM
+	 * NOTE: we support at most 8k 32 bit samples for one processing round
+	 * where the number of samples is rounded down to a multiple of four,
+	 * i.e. one can fit at most 2048 ramps of 4 samples per "task"
+	 * (remember this was made for demonstrational purposes only)
+	 *
+	 *
+	 * How this is done: the ramp fit assembly implementation we use here
+	 * expects us to deliver the data in two parallel TCM banks, which
+	 * should be banks 1 and 3, which in principle allows us to work on
+	 * ramps of up to 8192 samples total, which must be stored in two bank
+	 * groups sequentially but "de-interleaved", so there are no same-bank
+	 * access conflicts of the Xentium's E units and we may load 4 samples
+	 * in every clock cycle:
+	 *
+	 *  +----------v
+	 *  ⎮    1     2     3     4
+	 *  ⎮ +-----+-----+-----+-----+
+	 *  ⎮ ⎮  1  ⎮  9  ⎮  2  ⎮ 10  ⎮
+	 *  ⎮ ⎮  3  ⎮ ... ⎮  4  ⎮ ... ⎮
+	 *  ⎮ ⎮ ... ⎮ ... ⎮ ... ⎮ ... ⎮
+	 *  ⎮ ⎮  7  ⎮ 21  ⎮  8  ⎮ 22  ⎮
+	 *  ⎮ +-----+-----+-----+-----+
+	 *  +____v
+	 *
+	 *
+	 *
+	 * To set this up, we'll perform a DMA transfer to split the linear
+	 * sequences of ramp data into the banks.
+	 *
+	 * We instruct the DMA to consider the data in a shape that is 2 columns
+	 * by n/2 rows. Iterating over the source, it will take items from
+	 * columns sequentially and place them into the target with an offset of
+	 * 2 bank sizes apart (b3 - b1). It will then forward-skip two columns
+	 * to reach the next row, but perform no skip at the target, because
+	 * there the column shape is 1 column by n/2 rows.
+	 */
+
+	if (xen_noc_dma_req_xfer(m->dma, p, tcm_ext, 2, n/2, WORD,
+				 1, (int16_t)(b3 - b1),
+				 2, 1, LOW, 256) < 0) {
+		/* the TCM does not hold our samples, nothing to fit */
+		m->cmd = TASK_DESTROY;
+		goto free_slopes;
+	}
+
+	/* process the ramps */
+	FastIntFixedRampFitBuffer((long *) b1,
+				  (long *) b3,
+				  n, op_info->ramplen, slopes);
+
+	/* Now copy the resulting slopes to the data buffer and free the
+	 * temporary allocation.
+	 * We do this by performing two transfers, one from the slopes buffer to
+	 * the TCM, then back to the data buffer of the task.
+	 * We have to do this, because the DMA cannot perform transfers into the
+	 * same NoC node (if you do, it gets stuck)
+	 * XXX no retval handling
+	 */
+	xen_noc_dma_req_lin_xfer(m->dma, slopes, tcm_ext, n_ramps,
+				 WORD, LOW, 256);
+
+	/* now into the task's data buffer */
+	xen_noc_dma_req_lin_xfer(m->dma, tcm_ext, p, n_ramps, WORD, LOW, 256);
+
+	/* update the member size of the buffer */
+	pt_set_nmemb(m->t, n_ramps);
+
+	/* and we're done */
+	m->cmd = TASK_SUCCESS;
+
+free_slopes:
+	/* free the slope buffer */
+	kfree(slopes);
+}
+
+
+/**
+ * the main function: wait for a command, process the attached task and
+ * send the reply, until TASK_EXIT is received
+ */
+
+int main(void)
+{
+	struct xen_msg_data *m;
+
+	while (1) {
+		m = xen_wait_cmd();
+
+		if (!m) {
+			printk("Invalid command location, bailing.");
+			return 0;
+		}
+
+
+		switch (m->cmd) {
+		case TASK_EXIT:
+			/* confirm abort */
+			xen_send_msg(m);
+			return 0;
+		default:
+			break;
+		}
+
+		process_task(m);
+
+		xen_send_msg(m);
+	}
+
+
+	return 0;
+}
diff --git a/dsp/xentium/lib/data_proc_task.c b/dsp/xentium/lib/data_proc_task.c
deleted file mode 100644
index bef39bf451c1e2c754c9aadc278950a07372345e..0000000000000000000000000000000000000000
--- a/dsp/xentium/lib/data_proc_task.c
+++ /dev/null
@@ -1,35 +0,0 @@
-
-#include <data_proc_task.h>
-
-
-/* the xentium must be able to interpret the proc task
- * XXX this is just a hack, do it properly */
-
-
-
-
-/**
- * @brief get the number of elements in the data buffer of a processing task
- *
- * @param t a struct proc_task
- *
- * @return the number of elements in the buffer
- */
-
-size_t pt_get_nmemb(struct proc_task *t)
-{
-	return t->nmemb;
-}
-
-/**
- * @brief get the data buffer in a processing task
- *
- * @param t a struct proc_task
- *
- * @return the pointer to a data buffer (may be NULL)
- */
-
-void *pt_get_data(struct proc_task *t)
-{
-	return t->data;
-}
diff --git a/dsp/xentium/lib/dma.c b/dsp/xentium/lib/dma.c
index ed7d6041ed3488a8dbe5ff784026f1076c33cc78..1d61c387615187eeefee4e5f9db8c0c70157ee3f 100644
--- a/dsp/xentium/lib/dma.c
+++ b/dsp/xentium/lib/dma.c
@@ -199,7 +199,7 @@ 
static int noc_dma_start_transfer(struct noc_dma_channel *chan) return -EBUSY; iowrite32(NOC_DMA_CHANNEL_START, &chan->start); - + /* XXX remove once we figure out how to properly use the Xentium's * DMA status bits */ @@ -300,7 +300,7 @@ static int noc_dma_init_transfer(struct noc_dma_channel *c, * @param y_stride_dst the width of stride in destination y * * @param mtu the maximum transfer unit of a NoC packet - * + * * @returns <0 on error */ @@ -349,7 +349,7 @@ int xen_noc_dma_req_xfer(struct noc_dma_channel *c, t.priority = dma_priority; - + ret = noc_dma_init_transfer(c, &t); if (ret) return ret; diff --git a/dsp/xentium/lib/xen.c b/dsp/xentium/lib/xen.c index da658d628c3469b8d36ed593746b2d132c8907ef..3fd66f79296946c5502f7a9e33d220e14c88948e 100644 --- a/dsp/xentium/lib/xen.c +++ b/dsp/xentium/lib/xen.c @@ -27,7 +27,7 @@ void xen_set_mail(size_t mbox, unsigned long msg) { if (mbox >= XEN_MAILBOXES) return; - + xen_dev_local->mbox[mbox] = msg; } @@ -42,7 +42,7 @@ unsigned long xen_get_mail(size_t mbox) { if (mbox < XEN_MAILBOXES) return xen_dev_local->mbox[mbox]; - + return 0; } @@ -92,7 +92,7 @@ void xen_wait_timer(int timer, unsigned long cycles) xen_dev_local->timer[timer] = cycles; - + while(xen_dev_local->timer[timer]); } @@ -104,7 +104,7 @@ void xen_wait_timer(int timer, unsigned long cycles) void xen_wait_dma(void) { /* XXX this doesn't seem to work sensibly, we can't x_wait() on the - * dma irq status bit, because it is set UNTIL we clear the + * dma irq status bit, because it is set UNTIL we clear the * dma_irq, and we'd apparently have to also clear the latter * _before_ we even start the transfer. 
For now, it's easier to just * wait on the channel status bit in noc_dma_start_transfer() diff --git a/init/main.c b/init/main.c index 673005cfa5d693421046eae265128e1a2711a4e7..b0a089a2255ffcf52e89bca7af0da781cea6d8c8 100644 --- a/init/main.c +++ b/init/main.c @@ -59,7 +59,7 @@ int main(void) void *addr; struct elf_module m; - + printk(MSG "Loading module image\n"); /* load the embedded AR image */ @@ -89,7 +89,7 @@ int main(void) pr_debug(MSG "noc_dma module address is %p\n", addr); if (addr) module_load(&m, addr); - + modules_list_loaded(); #endif diff --git a/init/xentium_demo.c b/init/xentium_demo.c index 9fc0d69c3aadc3b77e3dd43c7f459a5eb913cd92..5aa02175b2c615df271290ab32aa0a8e50e2dce7 100644 --- a/init/xentium_demo.c +++ b/init/xentium_demo.c @@ -16,6 +16,20 @@ #include <kernel/xentium.h> + +/** + * Some implementation dependent op info passed by whatever created the task + * this could also just exist in the <data> buffer as a interpretable structure. + * This is really up to the user... 
+ * Note: the xentium kernel processing a task must know the same structure + */ + +struct myopinfo { + unsigned int ramplen; +}; + + + /** * @brief the output function of the xentium processing network */ @@ -29,6 +43,9 @@ static int xen_op_output(unsigned long op_code, struct proc_task *t) + /* need to address those caching issues at some point */ + asm("flush"); + n = pt_get_nmemb(t); @@ -42,7 +59,8 @@ static int xen_op_output(unsigned long op_code, struct proc_task *t) if (!p) goto exit; - printk("XEN_OUT: \t%d\n", ioread32be(&p[n-1])); + for (i = 0; i < n; i++) + printk("XEN_OUT: \t%d\n", ioread32be(&p[i])); exit: @@ -73,7 +91,11 @@ static void xen_new_input_task(size_t n) int i; unsigned int *data; + struct myopinfo *nfo; + nfo = kzalloc(sizeof(struct myopinfo)); + if (!nfo) + return; data = kzalloc(sizeof(unsigned int) * n); if (!data) @@ -86,11 +108,8 @@ static void xen_new_input_task(size_t n) BUG_ON(!t); - BUG_ON(pt_add_step(t, 0xdeadbeef, NULL)); - BUG_ON(pt_add_step(t, 0xb19b00b5, NULL)); - BUG_ON(pt_add_step(t, 0xdeadbeef, NULL)); - BUG_ON(pt_add_step(t, 0xb19b00b5, NULL)); - BUG_ON(pt_add_step(t, 0xb19b00b5, NULL)); + nfo->ramplen = 16; + BUG_ON(pt_add_step(t, 0x00bada55, nfo)); while (xentium_input_task(t) < 0) printk("Xenitium input busy!\n"); @@ -110,12 +129,9 @@ void xen_demo(void) xentium_config_output_node(xen_op_output); + xen_new_input_task(32); while (1) { - static int seq = 100; - if (seq < 120) - xen_new_input_task(seq++); - xentium_output_tasks(); }