d3q19: initial upload (no Zou/He)
authorSebastian <git@sraa.de>
Mon, 1 Sep 2014 19:28:36 +0000 (19:28 +0000)
committerSebastian <git@sraa.de>
Mon, 1 Sep 2014 19:28:36 +0000 (19:28 +0000)
Will not compile with Zou/He boundaries due
to linker error. The code does not fit in Bank 0,
even with -Os.

d3q19/Makefile [new file with mode: 0644]
d3q19/esrc/d3q19.c [new file with mode: 0644]
d3q19/esrc/d3q19.h [new file with mode: 0644]
d3q19/esrc/main.c [new file with mode: 0644]
d3q19/hsrc/data.c [new file with mode: 0644]
d3q19/hsrc/main.c [new file with mode: 0644]
d3q19/shared.h [new file with mode: 0644]

diff --git a/d3q19/Makefile b/d3q19/Makefile
new file mode 100644 (file)
index 0000000..5674c3d
--- /dev/null
@@ -0,0 +1,105 @@
+# Template Makefile for Epiphany
+
+# host toolchain
+HCC    = gcc
+HCFLAGS        = -O2 -std=c99 -I$(EPIPHANY_HOME)/tools/host/include -Wall
+HLFLAGS        = -lm -L$(EPIPHANY_HOME)/tools/host/lib -le-hal
+ECHO   = /bin/echo -e
+
+# target toolchain
+ECC    = e-gcc
+EOC    = e-objcopy
+ECFLAGS        = -Os -std=c99 -falign-loops=8 -falign-functions=8 -Wall -fsingle-precision-constant -ffast-math
+ELFLAGS        = -T$(EPIPHANY_HOME)/bsps/current/internal.ldf -le-lib
+EOFLAGS        = -R .shared_dram -R .data_bank1 -R .data_bank2 -R .data_bank3
+
+# host application
+HAPP   = $(DEST)/ep_main
+HOBJS  = $(HDEST)/main.o $(HDEST)/data.o
+
+# epiphany applications
+EAPPS  = $(DEST)/main.srec
+ECOMMON        = $(EDEST)/d3q19.o
+
+# folders
+HSRC   = hsrc
+HDEST  = hobj
+ESRC   = esrc
+EDEST  = eobj
+DEST   = bin
+
+# === Magic begins here ===================================================
+EOBJS  = $(EAPPS:$(DEST)%srec=$(EDEST)%o) $(ECOMMON)
+EELFS  = $(EAPPS:$(DEST)%srec=$(EDEST)%elf)
+
+.SECONDARY:
+.PHONY: all help host target folders run clean
+.NOTPARALLEL: clean
+
+# === Phony Rules =========================================================
+help:
+       @$(ECHO)
+       @$(ECHO) "Epiphany Makefile - Help"
+       @$(ECHO) "  help    show this help"
+       @$(ECHO) "  host    build host application      ($(HAPP))"
+       @$(ECHO) "  target  build epiphany applications ($(EAPPS))"
+       @$(ECHO) "  all     build all"
+       @$(ECHO) "  run     build all, then run host application"
+       @$(ECHO) "  clean   remove applications and intermediate files"
+       @$(ECHO)
+
+all: host target
+
+host: folders $(HAPP)
+
+target: folders $(EAPPS)
+
+folders: $(HDEST) $(EDEST) $(DEST)
+
+run: host target
+       @$(ECHO) "\tRUN"
+       @sudo LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) \
+             EPIPHANY_HDF=$(EPIPHANY_HDF) \
+             $(HAPP)
+
+clean:
+       @$(ECHO) "\tCLEAN"
+       @rm -v -f $(HAPP) $(HOBJS) $(EAPPS) $(EELFS) $(EOBJS)
+       @-rmdir -v --ignore-fail-on-non-empty $(HDEST) $(EDEST) $(DEST) \
+               2>/dev/null
+
+$(HDEST):
+       @$(ECHO) "\t(HOST)   MKDIR $(HDEST)"
+       @mkdir -p $(HDEST)
+
+$(EDEST):
+       @$(ECHO) "\t(HOST)   MKDIR $(EDEST)"
+       @mkdir -p $(EDEST)
+
+$(DEST):
+       @$(ECHO) "\t(HOST)   MKDIR $(DEST)"
+       @mkdir -p $(DEST)
+
+# === Host Rules ==========================================================
+$(HAPP): $(HOBJS)
+       @$(ECHO) "\t(HOST)   LINK\t$@"
+       @$(HCC) -o $@ $^ $(HLFLAGS)
+
+$(HDEST)/%.o: $(HSRC)/%.c
+       @$(ECHO) "\t(HOST)   CC\t$@"
+       @$(HCC) $(HCFLAGS) -c -o $@ $^
+
+# === Target Rules ========================================================
+$(DEST)/%.srec: $(EDEST)/%.elf
+       @$(ECHO) "\t(TARGET) OBJCOPY $@"
+       @$(EOC) $(EOFLAGS) --output-target srec --srec-forceS3 $^ $@
+
+$(EDEST)/%.elf: $(EDEST)/%.o $(ECOMMON)
+       @$(ECHO) "\t(TARGET) LINK\t$@"
+       @$(ECC) -o $@ $^ $(ELFLAGS)
+
+$(EDEST)/%.o: $(ESRC)/%.c
+       @$(ECHO) "\t(TARGET) CC\t$@"
+       @$(ECC) $(ECFLAGS) -c -o $@ $^
+# =========================================================================
+
diff --git a/d3q19/esrc/d3q19.c b/d3q19/esrc/d3q19.c
new file mode 100644 (file)
index 0000000..d50c11b
--- /dev/null
@@ -0,0 +1,262 @@
+/* Lattice Boltzmann Algorithm Implementation (D3Q19) */
+
+#include <e-lib.h>
+#include "../shared.h"
+
+/* core indices */
+extern unsigned int row, col, core;
+
+/* ================================================================== */
+
+static const int d3q19_v[19][3] = { { 0, 0, 0},
+       {-1, 0, 0}, { 0,-1, 0}, { 0, 0,-1}, {-1,-1, 0}, {-1, 1, 0}, {-1, 0,-1},
+       {-1, 0, 1}, { 0,-1,-1}, { 0,-1, 1}, { 1, 0, 0}, { 0, 1, 0}, { 0, 0, 1},
+       { 1, 1, 0}, { 1,-1, 0}, { 1, 0, 1}, { 1, 0,-1}, { 0, 1, 1}, { 0, 1,-1},
+};
+static const FLOAT d3q19_w[19] = { 1./3.,
+       1./18., 1./18., 1./18., 1./36., 1./36., 1./36.,
+       1./36., 1./36., 1./36., 1./18., 1./18., 1./18.,
+       1./36., 1./36., 1./36., 1./36., 1./36., 1./36.,
+};
+
+/* ================================================================== */
+
+void init(block_t f)
+{
+       /* initialize all nodes with rho = 0.1 */
+       for(int z = 0; z < NODES_Z; z++)
+               for(int y = 0; y < NODES_Y; y++)
+                       for(int x = 0; x < NODES_X; x++)
+                               for(int q = 0; q < 19; q++)
+                                       f[z][y][x][q] = 0.1 * d3q19_w[q];
+
+#if 1
+       /* except here */
+       if(core == 0)
+               for(int q = 0; q < 19; q++)
+                       f[1][1][1][q] = 0.2 * d3q19_w[q];
+#endif
+}
+
+/* ================================================================== */
+
+void collide(block_t f, int x, int y, int z, FLOAT omega)
+{
+#if 0
+       /* Zou/He boundary conditions */
+       if(row == 0 && z == 0) {                                /* top */
+               const FLOAT Ux = 0.0;   /* wall speed */
+               const FLOAT Uy = 0.2;
+               const FLOAT Uz = 0.0;
+
+               FLOAT rho = ( f[z][y][x][0]  + f[z][y][x][1]  + f[z][y][x][2]  +
+                             f[z][y][x][4]  + f[z][y][x][5]  + f[z][y][x][10] +
+                             f[z][y][x][11] + f[z][y][x][13] + f[z][y][x][14] +
+                       2 * ( f[z][y][x][16] + f[z][y][x][18] + f[z][y][x][3]  +
+                             f[z][y][x][6]  + f[z][y][x][8]  ) / (1 + Uz) );
+
+               FLOAT tmp0 = rho / 2;
+               FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+               FLOAT tmp2 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+               FLOAT tmp3 = ( f[z][y][x][13] - f[z][y][x][4] ) / 2;
+               FLOAT tmp4 = ( f[z][y][x][14] - f[z][y][x][5] ) / 2;
+
+               f[z][y][x][15] = f[z][y][x][6]  - tmp1 - tmp3 - tmp4 -
+                       tmp0 * (Uz/3 - Ux);
+               f[z][y][x][7]  = f[z][y][x][16] + tmp1 + tmp3 + tmp4 -
+                       tmp0 * (Uz/3 + Ux);
+               f[z][y][x][17] = f[z][y][x][8]  - tmp2 - tmp3 + tmp4 -
+                       tmp0 * (Uz/3 - Uy);
+               f[z][y][x][9]  = f[z][y][x][18] + tmp2 + tmp3 - tmp4 -
+                       tmp0 * (Uz/3 + Uy);
+               f[z][y][x][12] = f[z][y][x][3] - rho * Uz / 3;
+
+       } else if(row == CORES_Y-1 && z == NODES_Z-1) {         /* bottom */
+               FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+               FLOAT tmp2 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+               FLOAT tmp3 = ( f[z][y][x][13] - f[z][y][x][4] ) / 2;
+               FLOAT tmp4 = ( f[z][y][x][14] - f[z][y][x][5] ) / 2;
+
+               f[z][y][x][6]  = f[z][y][x][15] + tmp1 + tmp3 + tmp4;
+               f[z][y][x][16] = f[z][y][x][7]  - tmp1 - tmp3 - tmp4;
+               f[z][y][x][8]  = f[z][y][x][17] + tmp2 + tmp3 - tmp4;
+               f[z][y][x][18] = f[z][y][x][9]  - tmp2 - tmp3 + tmp4;
+               f[z][y][x][3]  = f[z][y][x][12];
+
+       } else if(y == 0) {                                     /* front */
+               FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+               FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+               FLOAT tmp3 = ( f[z][y][x][15] - f[z][y][x][6] ) / 2;
+               FLOAT tmp4 = ( f[z][y][x][16] - f[z][y][x][7] ) / 2;
+
+               f[z][y][x][13] = f[z][y][x][4]  - tmp1 - tmp3 - tmp4;
+               f[z][y][x][5]  = f[z][y][x][14] + tmp1 + tmp3 + tmp4;
+               f[z][y][x][17] = f[z][y][x][8]  - tmp2 - tmp3 - tmp4;
+               f[z][y][x][18] = f[z][y][x][9]  + tmp2 + tmp3 + tmp4;
+               f[z][y][x][11] = f[z][y][x][2];
+
+       } else if(y == NODES_Y-1) {                             /* back */
+               FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+               FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+               FLOAT tmp3 = ( f[z][y][x][15] - f[z][y][x][6] ) / 2;
+               FLOAT tmp4 = ( f[z][y][x][16] - f[z][y][x][7] ) / 2;
+
+               f[z][y][x][4]  = f[z][y][x][13] + tmp1 + tmp3 + tmp4;
+               f[z][y][x][14] = f[z][y][x][5]  - tmp1 - tmp3 - tmp4;
+               f[z][y][x][8]  = f[z][y][x][17] + tmp2 + tmp3 + tmp4;
+               f[z][y][x][9]  = f[z][y][x][18] - tmp2 - tmp3 - tmp4;
+               f[z][y][x][2]  = f[z][y][x][11];
+
+       } else if(col == 0 && x == 0) {                         /* left */
+               FLOAT tmp1 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+               FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+               FLOAT tmp3 = ( f[z][y][x][17] - f[z][y][x][8] ) / 2;
+               FLOAT tmp4 = ( f[z][y][x][18] - f[z][y][x][9] ) / 2;
+
+               f[z][y][x][13] = f[z][y][x][4]  - tmp1 - tmp3 - tmp4;
+               f[z][y][x][14] = f[z][y][x][5]  + tmp1 + tmp3 + tmp4;
+               f[z][y][x][15] = f[z][y][x][6]  - tmp2 - tmp3 + tmp4;
+               f[z][y][x][16] = f[z][y][x][7]  + tmp2 + tmp3 - tmp4;
+               f[z][y][x][10] = f[z][y][x][1];
+
+       } else if(col == CORES_X-1 && x == NODES_X-1) {         /* right */
+               FLOAT tmp1 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+               FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+               FLOAT tmp3 = ( f[z][y][x][17] - f[z][y][x][8] ) / 2;
+               FLOAT tmp4 = ( f[z][y][x][18] - f[z][y][x][9] ) / 2;
+
+               f[z][y][x][4]  = f[z][y][x][13] + tmp1 + tmp3 + tmp4;
+               f[z][y][x][5]  = f[z][y][x][14] - tmp1 - tmp3 - tmp4;
+               f[z][y][x][6]  = f[z][y][x][15] + tmp2 + tmp3 - tmp4;
+               f[z][y][x][7]  = f[z][y][x][16] - tmp2 - tmp3 + tmp4;
+               f[z][y][x][1]  = f[z][y][x][10];
+       }
+#endif
+
+       /* macroscopic */
+       FLOAT rho =
+               f[z][y][x][0] + f[z][y][x][1] + f[z][y][x][2] +
+               f[z][y][x][3] + f[z][y][x][4] + f[z][y][x][5] +
+               f[z][y][x][6] + f[z][y][x][7] + f[z][y][x][8] +
+               f[z][y][x][9] + f[z][y][x][10] + f[z][y][x][11] +
+               f[z][y][x][12] + f[z][y][x][13] + f[z][y][x][14] +
+               f[z][y][x][15] + f[z][y][x][16] + f[z][y][x][17] +
+               f[z][y][x][18];
+
+       FLOAT ux = (
+               f[z][y][x][10] + f[z][y][x][13] + f[z][y][x][14] +
+               f[z][y][x][15] + f[z][y][x][16] - f[z][y][x][1] -
+               f[z][y][x][4]  - f[z][y][x][5]  - f[z][y][x][6] -
+               f[z][y][x][7] ) / rho;
+
+       FLOAT uy = (
+               f[z][y][x][5]  + f[z][y][x][11] + f[z][y][x][13] +
+               f[z][y][x][17] + f[z][y][x][18] - f[z][y][x][2]  -
+               f[z][y][x][4]  - f[z][y][x][8]  - f[z][y][x][9]  -
+               f[z][y][x][14] ) / rho;
+
+       FLOAT uz = (
+               f[z][y][x][7]  + f[z][y][x][9]  + f[z][y][x][12] +
+               f[z][y][x][15] + f[z][y][x][17] - f[z][y][x][3]  -
+               f[z][y][x][6]  - f[z][y][x][8]  - f[z][y][x][16] -
+               f[z][y][x][18] ) / rho;
+
+       FLOAT sqr = 1.5 * (ux*ux + uy*uy + uz*uz);
+
+       /* update node */
+       for(int q = 0; q < 19; q++) {
+               FLOAT cu =
+                       ux*d3q19_v[q][0] + uy*d3q19_v[q][1] + uz*d3q19_v[q][2];
+               FLOAT eq = rho * d3q19_w[q] *
+                       (1. + 3. * cu + 4.5 * cu*cu - sqr);
+               f[z][y][x][q] *= (1.0 - omega);
+               f[z][y][x][q] += omega * eq;
+       }
+
+       /* swap */
+       for(int q = 1; q <= 9; q++) {
+               FLOAT tmp       = f[z][y][x][q];
+               f[z][y][x][q]   = f[z][y][x][q+9];
+               f[z][y][x][q+9] = tmp;
+       }
+}
+
+/* ================================================================== */
+
+void stream(block_t f, int x, int y, int z)
+{
+       for(int q = 1; q <= 9; q++) {
+               int next_row = row;
+               int next_col = col;
+               int next_x   = x + d3q19_v[q][0];
+               int next_y   = y + d3q19_v[q][1];
+               int next_z   = z + d3q19_v[q][2];
+
+               /* inner borders extend in xz-plane */
+               if(next_x < 0)        { next_col--; next_x += NODES_X; }
+               if(next_x >= NODES_X) { next_col++; next_x -= NODES_X; }
+               if(next_z < 0)        { next_row--; next_z += NODES_Z; }
+               if(next_z >= NODES_Z) { next_row++; next_z -= NODES_Z; }
+
+               /* outer borders are not streamed */
+               if(next_col < 0)             { continue; }      /* -x */
+               else if(next_col >= CORES_X) { continue; }      /* +x */
+               if(next_y < 0)               { continue; }      /* -y */
+               else if(next_y >= NODES_Y)   { continue; }      /* +y */
+               if(next_row < 0)             { continue; }      /* -z */
+               else if(next_row >= CORES_Y) { continue; }      /* +z */
+
+               /* f: local block, g: local/remote block */
+               block_t *g = (void*)f;
+               if(next_row != row || next_col != col) {
+                       g = e_get_global_address(next_col, next_row, (void*)f);
+               }
+
+               /* stream by swapping */
+               FLOAT tmp       = f[z][y][x][q+9];
+               f[z][y][x][q+9] = (*g)[next_z][next_y][next_x][q];
+               (*g)[next_z][next_y][next_x][q] = tmp;
+       }
+}
+
+/* ================================================================== */
+
+#if 0
+void d2q9_collide_stream_bulk(d2q9_block_t f, FLOAT omega)
+{
+       /* don't touch the border nodes */
+       for(int x = 1; x < BLOCKS_X-1; x++) {
+               for(int y = 1; y < BLOCKS_Y-1; y++) {
+                       /* macroscopic */
+                       FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+                               f[y][x][3] + f[y][x][4] + f[y][x][5] +
+                               f[y][x][6] + f[y][x][7] + f[y][x][8];
+                       FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+                               f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+                       FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+                               f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
+                       FLOAT sqr = 1.5 * (ux*ux + uy*uy);
+
+                       /* update node */
+                       for(int q = 0; q < 9; q++) {
+                               FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
+                               FLOAT eq = rho * d2q9_w[q] *
+                                       (1. + 3. * cu + 4.5 * cu*cu - sqr);
+                               f[y][x][q] *= (1.0 - omega);
+                               f[y][x][q] += omega * eq;
+                       }
+
+                       /* stream */
+                       for(int q = 0; q <= 4; q++) {
+                               int next_x = x + d2q9_v[q][0];
+                               int next_y = y + d2q9_v[q][1];
+
+                               FLOAT tmp    = f[y][x][q];
+                               f[y][x][q]   = f[y][x][q+4];
+                               f[y][x][q+4] = f[next_y][next_x][q];
+                               f[next_y][next_x][q] = tmp;
+                       }
+               }
+       }
+}
+#endif
diff --git a/d3q19/esrc/d3q19.h b/d3q19/esrc/d3q19.h
new file mode 100644 (file)
index 0000000..3b8f57a
--- /dev/null
@@ -0,0 +1,9 @@
+/* Lattice Boltzmann Algorithm Header */
+
+#include "../shared.h"
+
+void init               (block_t);
+void collide            (block_t, int x, int y, int z,  FLOAT);
+void stream             (block_t, int x, int y, int z);
+void collide_stream_bulk(block_t, FLOAT);
+
diff --git a/d3q19/esrc/main.c b/d3q19/esrc/main.c
new file mode 100644 (file)
index 0000000..625fb67
--- /dev/null
@@ -0,0 +1,139 @@
+/* Lattice Boltzmann Implementation */
+
+#include <stdint.h>
+#include <string.h>
+
+#include <e-lib.h>
+
+#include "d3q19.h"
+#include "../shared.h"
+
+/* shared memory overlay */
+volatile shm_t shm SECTION(".shared_dram");
+
+/* statically allocate dummy memory and local block overlay
+   to prevent linker from putting stuff in banks 1..3 */
+static uint8_t dummy_bank1[8192] USED SECTION(".data_bank1");
+static uint8_t dummy_bank2[8192] USED SECTION(".data_bank2");
+static uint8_t dummy_bank3[8192] USED SECTION(".data_bank3");
+static block_t *block = (void*)0x2000;
+
+/* barrier structures */
+volatile e_barrier_t  barriers[CORES];
+         e_barrier_t *tgt_bars[CORES];
+
+/* global index variables */
+unsigned int row, col, core;
+
+#define READ_TIMER(X) \
+       do { \
+               times[X] = E_CTIMER_MAX - e_ctimer_stop(E_CTIMER_0); \
+               e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); \
+               e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); \
+       } while(0);
+
+int main()
+{
+       const FLOAT omega = 1.0;
+       unsigned times[TIMERS] = {0};
+
+       /* compile-time bombs, for E16G3 */
+       BUILD_BUG(NODES * sizeof(node_t) > 24 * 1024);
+       BUILD_BUG(NODES_X < 3 || NODES_Y < 3 || NODES_Z < 3);
+       BUILD_BUG(CORES_X < 1 || CORES_Y < 1);
+       BUILD_BUG(CORES_X > 4 || CORES_Y > 4);
+
+       /* save mesh coordinates */
+       e_coords_from_coreid(e_get_coreid(), &col, &row);
+       core = row * CORES_X + col;
+
+       /* initialize barrier */
+       e_barrier_init(barriers, tgt_bars);
+
+       /* initialize lattice */
+       init(*block);
+
+       /* main loop */
+       for(int iter = 0; iter < 3000; iter++) {
+#if 1
+               /* collide all nodes */
+               for(int z = 0; z < NODES_Z; z++)
+                       for(int y = 0; y < NODES_Y; y++)
+                               for(int x = 0; x < NODES_X; x++)
+                                       collide(*block, x, y, z, omega);
+
+               /* synchronize */
+               e_barrier(barriers, tgt_bars);
+
+               /* stream all nodes */
+               for(int z = 0; z < NODES_Z; z++)
+                       for(int y = 0; y < NODES_Y; y++)
+                               for(int x = 0; x < NODES_X; x++)
+                                       stream(*block, x, y, z);
+
+#else
+               /* collide boundaries: top, bottom */
+               for(int x = 0; x < BLOCKS_X; x++) {
+                       d2q9_collide(*block, x, 0,          omega);
+                       d2q9_collide(*block, x, BLOCKS_Y-1, omega);
+               }
+
+               /* collide boundaries: left, right */
+               for(int y = 1; y < BLOCKS_Y-1; y++) {
+                       d2q9_collide(*block, 0,         y, omega);
+                       d2q9_collide(*block, BLOCKS_X-1, y, omega);
+               }
+
+               /* synchronize */
+               e_barrier(barriers, tgt_bars);
+
+               /* collide and stream the bulk */
+               d2q9_collide_stream_bulk(*block, omega);
+
+               /* stream the boundaries: top, bottom */
+               for(int x = 0; x < BLOCKS_X; x++) {
+                       d2q9_stream(*block, x, 0         );
+                       d2q9_stream(*block, x, BLOCKS_Y-1);
+               }
+
+               /* stream the boundaries: left, right */
+               for(int y = 1; y < BLOCKS_Y-1; y++) {
+                       d2q9_stream(*block, 0,          y);
+                       d2q9_stream(*block, BLOCKS_X-1, y);
+               }
+#endif
+
+               /* copy data to shm if necessary */
+               if(!(iter % 3)) {
+                       /* copy lattice to shm */
+                       memcpy(&shm.lattice[row][col], block, sizeof(block_t));
+
+                       /* copy times to shm */
+                       memcpy(&shm.times[row][col], times, sizeof(times_t));
+
+                       /* synchronize */
+                       e_barrier(barriers, tgt_bars);
+
+                       /* core 0: write counter and flag host; wait */
+                       if(core == 0) {
+                               shm.iteration = iter;
+                               shm.pollflag  = POLL_READY;
+                               while(shm.pollflag == POLL_READY);
+                       }
+               }
+
+               /* synchronize */
+               e_barrier(barriers, tgt_bars);
+       }
+
+       /* core 0: flag host, all iterations done */
+       if(core == 0) {
+               shm.pollflag = POLL_DONE;
+       }
+
+       /* stop core */
+       while(1) {
+               __asm__ volatile("idle");
+       }
+}
+
diff --git a/d3q19/hsrc/data.c b/d3q19/hsrc/data.c
new file mode 100644 (file)
index 0000000..bf20c45
--- /dev/null
@@ -0,0 +1,237 @@
+/* Helper Functions to handle data (3D) */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <unistd.h>
+
+#include "../shared.h"
+
+/* fix file owner if run with sudo */
+void fixsudo(const char *filename)
+{
+       if(getenv("SUDO_UID") && getenv("SUDO_GID")) {
+               int uid = atoi(getenv("SUDO_UID"));
+               int gid = atoi(getenv("SUDO_GID"));
+               if(chown(filename, uid, gid)) {
+                       perror("fixsudo/chown");
+                       return;
+               }
+       }
+}
+
+#if 0
+/* write a (semi-) human-readable dump of the lattice */
+void write_populations(d2q9_block_t lattice[CORES_Y][CORES_X], int iter)
+{
+       FILE *file = fopen("populations.dat", "a");
+       if(!file) {
+               perror("write_populations/fopen");
+               return;
+       }
+
+       for(int cy = 0; cy < CORES_Y; cy++) {
+               for(int y = 0; y < BLOCKS_Y; y++) {
+                       for(int cx = 0; cx < CORES_X; cx++) {
+                               for(int x = 0; x < BLOCKS_X; x++) {
+                                       fprintf(file, "%3d: [%3d,%3d]: ",
+                                               iter,
+                                               cx * BLOCKS_X + x,
+                                               cy * BLOCKS_Y + y
+                                       );
+                                       for(int q = 0; q < 9; q++) {
+                                               fprintf(file, "%.5f ", lattice[cy][cx][y][x][q]);
+                                       }
+                                       fprintf(file, "\n");
+                               }
+                       }
+               }
+       }
+       fprintf(file, "\n");
+
+       /* close */
+       fclose(file);
+
+       return;
+}
+#endif
+
+/* write 8-bit grayscale PPM of particle density in a slice */
+void write_density(block_t f[CORES_Y][CORES_X], int iter)
+{
+       char name[32];
+       FLOAT min = 1.0, max = 0.0;
+       FLOAT rhos[LATTICE_Z][LATTICE_Y][LATTICE_X];
+
+       /* write file header */
+       snprintf(name, 32, "./tmp/i%06d.ppm", iter);
+       FILE *file = fopen(name, "wb");
+       if(!file) {
+               perror("write_density/fopen");
+               return;
+       }
+       fprintf(file, "P5\n%d %d\n%d\n", LATTICE_X, LATTICE_Z, 255);
+
+       /* calculate densities from grid */
+       for(int cy = 0; cy < CORES_Y; cy++)
+               for(int z = 0; z < NODES_Z; z++)
+                       for(int y = 0; y < NODES_Y; y++)
+                               for(int cx = 0; cx < CORES_X; cx++)
+                                       for(int x = 0; x < NODES_X; x++)
+       {
+               const int idx[3] = { cx * NODES_X + x, y, cy * NODES_Z + z};
+
+               FLOAT rho = 0;
+               for(int q = 0; q < 19; q++)
+                       rho += f[cy][cx][z][y][x][q];
+
+               if(rho < min) min = rho;
+               if(rho > max) max = rho;
+               rhos[idx[2]][idx[1]][idx[0]] = rho;
+       }
+
+       /* write slice of grid */
+       for(int z = 0; z < LATTICE_Z; z++) {
+               for(int x = 0; x < LATTICE_X; x++) {
+                       FLOAT value = rhos[z][NODES_Y/2][x];
+
+                       unsigned char gray = ( 255 *
+                               (value - min) / (max - min) );
+                       fwrite(&gray, 1, 1, file);
+               }
+       }
+
+       /* close the file and fix permissions */
+       fclose(file);
+       fixsudo(name);
+
+       return;
+}
+
+/* write 8-bit grayscale PPM of absolute particle velocity in a slice */
+void write_velocity(block_t f[CORES_Y][CORES_X], int iter)
+{
+       char name[32];
+       FLOAT min = 1.0, max = 0.0;
+       FLOAT vs[LATTICE_Z][LATTICE_Y][LATTICE_X];
+
+       /* write file header */
+       snprintf(name, 32, "./tmp/i%06d.ppm", iter);
+       FILE *file = fopen(name, "wb");
+       if(!file) {
+               perror("write_density/fopen");
+               return;
+       }
+       fprintf(file, "P5\n%d %d\n%d\n", LATTICE_X, LATTICE_Z, 255);
+
+       /* calculate velocities from grid */
+       for(int cy = 0; cy < CORES_Y; cy++)
+               for(int z = 0; z < NODES_Z; z++)
+                       for(int y = 0; y < NODES_Y; y++)
+                               for(int cx = 0; cx < CORES_X; cx++)
+                                       for(int x = 0; x < NODES_X; x++)
+       {
+               const int idx[3] = { cx * NODES_X + x, y, cy * NODES_Z + z};
+
+               FLOAT rho = 0;
+               for(int q = 0; q < 19; q++)
+                       rho += f[cy][cx][z][y][x][q];
+
+               FLOAT ux = (
+                       f[cy][cx][z][y][x][10] + f[cy][cx][z][y][x][13] +
+                       f[cy][cx][z][y][x][14] + f[cy][cx][z][y][x][15] +
+                       f[cy][cx][z][y][x][16] - f[cy][cx][z][y][x][1]  -
+                       f[cy][cx][z][y][x][4]  - f[cy][cx][z][y][x][5]  -
+                       f[cy][cx][z][y][x][6]  - f[cy][cx][z][y][x][7]
+                       ) / rho;
+
+               FLOAT uy = (
+                       f[cy][cx][z][y][x][5]  + f[cy][cx][z][y][x][11] +
+                       f[cy][cx][z][y][x][13] + f[cy][cx][z][y][x][17] +
+                       f[cy][cx][z][y][x][18] - f[cy][cx][z][y][x][2]  -
+                       f[cy][cx][z][y][x][4]  - f[cy][cx][z][y][x][8]  -
+                       f[cy][cx][z][y][x][9]  - f[cy][cx][z][y][x][14]
+                       ) / rho;
+
+               FLOAT uz = (
+                       f[cy][cx][z][y][x][7]  + f[cy][cx][z][y][x][9]  +
+                       f[cy][cx][z][y][x][12] + f[cy][cx][z][y][x][15] +
+                       f[cy][cx][z][y][x][17] - f[cy][cx][z][y][x][3]  -
+                       f[cy][cx][z][y][x][6]  - f[cy][cx][z][y][x][8]  -
+                       f[cy][cx][z][y][x][16] - f[cy][cx][z][y][x][18]
+                       ) / rho;
+
+               FLOAT u = sqrtf(ux*ux + uy*uy + uz*uz);
+               if(u < min) min = u;
+               if(u > max) max = u;
+               vs[idx[2]][idx[1]][idx[0]] = u;
+       }
+
+       /* write slice of grid */
+       for(int z = 0; z < LATTICE_Z; z++) {
+               for(int x = 0; x < LATTICE_X; x++) {
+                       FLOAT value = vs[z][NODES_Y/2][x];
+
+                       unsigned char gray = ( 255 *
+                               (value - min) / (max - min) );
+                       fwrite(&gray, 1, 1, file);
+               }
+       }
+
+       /* close the file and fix permissions */
+       fclose(file);
+       fixsudo(name);
+
+       return;
+}
+
+/* convert image files to animated gif ./tmp/anim.gif */
+void convert_to_gif(void)
+{
+       int result;
+
+       /* call imagemagick */
+       result = system("convert ./tmp/i*.ppm ./tmp/anim.gif"); (void)result;
+       fixsudo("./tmp/anim.gif");
+
+       return;
+}
+
+/* convert image files to mp4 ./tmp/anim.mp4 */
+void convert_to_mp4(void)
+{
+       int result;
+
+       /* call ffmpeg
+          FIXME: requires ffmpeg binary with '-pattern_type glob' support,
+                 so this will not work on the Parallella without updates! */
+       result = system("ffmpeg -v quiet -pattern_type glob -i "
+                       "'./tmp/i*.ppm' ./tmp/anim.mp4"); (void)result;
+       fixsudo("./tmp/anim.mp4");
+
+       return;
+}
+
+/* write timer values */
+void write_timers(times_t t[CORES_Y][CORES_X], uint32_t iter)
+{
+       FILE *file = fopen("timers.dat", "ab");
+       if(!file) {
+               perror("write_timers/fopen");
+               return;
+       }
+
+       fprintf(file, "Iteration: i=%d\n", iter);
+       for(int y = 0; y < CORES_Y; y++) {
+               for(int x = 0; x < CORES_X; x++) {
+                       fprintf(file, "[%d,%d]: ", x, y);
+                       for(int i = 0; i < TIMERS; i++) {
+                               fprintf(file, "%8d ", t[y][x][i]);
+                       }
+                       fprintf(file, "\n");
+               }
+       }
+
+       fclose(file);
+}
+
diff --git a/d3q19/hsrc/main.c b/d3q19/hsrc/main.c
new file mode 100644 (file)
index 0000000..e0363b3
--- /dev/null
@@ -0,0 +1,117 @@
+/* Host Application */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include <e-hal.h>
+#include "../shared.h"
+
+#define FAIL(...) { fprintf(stderr, __VA_ARGS__); exit(1); }
+#define SHM_OFFSET 0x01000000
+
+/* helper functions */
+void fixsudo(const char *filename);
+void write_populations(block_t f[CORES_Y][CORES_X], int iter);
+void write_density    (block_t f[CORES_Y][CORES_X], int iter);
+void write_velocity   (block_t f[CORES_Y][CORES_X], int iter);
+void convert_to_gif(void);
+void convert_to_mp4(void);
+void write_timers(times_t t[CORES_Y][CORES_X], int iter);
+
+/* globals */
+static shm_t    shm = { 0 };   /* local shm copy */
+static uint32_t pollflag;
+
+int main()
+{
+       char *filename = "bin/main.srec";
+
+       /* remove old results */
+       int dummy = system("rm -f ./tmp/i*.ppm ./tmp/anim.gif ./tmp/anim.mp4 populations.dat timers.dat");
+       (void)dummy;
+
+       e_epiphany_t dev;
+       e_mem_t      mem;
+
+       e_set_host_verbosity(H_D0);
+       e_set_loader_verbosity(L_D0);
+
+       /* initialize workgroup, allocate and clear shared memory */
+       if(e_init(NULL) != E_OK)
+               FAIL("Can't init!\n");
+       e_reset_system();
+       if(e_open(&dev, 0, 0, CORES_X, CORES_Y) != E_OK)
+               FAIL("Can't open!\n");
+       if(e_alloc(&mem, SHM_OFFSET, sizeof(shm_t)) != E_OK)
+               FAIL("Can't alloc!\n");
+       if(e_write(&mem, 0, 0, (off_t)0, &shm, sizeof(shm_t)) == E_ERR)
+               FAIL("Can't clear shm!\n");
+
+       /* load programs */
+       printf("Starting cores:\n");
+       for(int y = 0; y < CORES_Y; y++) {
+               for(int x = 0; x < CORES_X; x++) {
+                       printf("(%02d,%02d) ", x, y);
+                       if(e_load(filename, &dev, x, y, E_TRUE) != E_OK)
+                               FAIL("Can't load!\n");
+               }
+               printf("\n");
+       }
+
+       /* ================================================================ */
+       printf("Polling shared memory.\n");
+       while(1) {
+
+               while(1) {
+                       /* read polling flag */
+                       if(e_read(&mem, 0, 0, (off_t)0, &pollflag,
+                               sizeof(uint32_t)) == E_ERR)
+                                       FAIL("Can't read pollflag!\n");
+
+                       /* wait for something */
+                       if(pollflag != POLL_BUSY) break;
+               }
+
+               /* finish if done */
+               if(pollflag == POLL_DONE) break;
+
+               /* read full shared memory */
+               if(e_read(&mem, 0, 0, (off_t)0, &shm, sizeof(shm_t)) == E_ERR)
+                       FAIL("Can't read full shm!\n");
+
+               /* reset pollflag */
+               pollflag = 0;
+               if(e_write(&mem, 0, 0, (off_t)0, &pollflag,
+                       sizeof(uint32_t)) == E_ERR) {
+                               FAIL("Can't reset pollflag!\n");
+               }
+
+               /* print iteration */
+               printf("0x%08x\r", shm.iteration); fflush(stdout);
+
+               /* write data */
+               //write_populations(shm.lattice, shm.iteration);
+               write_density(shm.lattice, shm.iteration);
+               //write_velocity(shm.lattice, shm.iteration);
+               write_timers(shm.times, shm.iteration);
+       }
+       /* ================================================================ */
+
+       if(e_free(&mem)  != E_OK) FAIL("Can't free!\n");
+       if(e_close(&dev) != E_OK) FAIL("Can't close!\n");
+       if(e_finalize()  != E_OK) FAIL("Can't finalize!\n");
+
+       fixsudo("populations.dat");
+       fixsudo("timers.dat");
+
+       printf("\nProgram finished successfully.\n");
+       printf("Convert ...\n");
+       convert_to_gif();
+
+       return(0);
+}
+
diff --git a/d3q19/shared.h b/d3q19/shared.h
new file mode 100644 (file)
index 0000000..b0c278e
--- /dev/null
@@ -0,0 +1,58 @@
+/* shared data types and external memory layout */
+#ifndef _SHARED_H_
+#define _SHARED_H_
+
+#include <stdint.h>
+
+/* ================================================================== */
+
+/* number of cores */
+#define CORES_X 4
+#define CORES_Y 4
+
+/* number of nodes per core */
+#define NODES_X 10
+#define NODES_Y 3
+#define NODES_Z 10
+
+/* ================================================================== */
+
+/* preprocessor magic */
+#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
+#define USED __attribute__((used))
+#undef PACKED
+#define PACKED __attribute__((packed))
+#undef ALIGN
+#define ALIGN(X) __attribute__((aligned(X)))
+
+/* some calculations */
+#define CORES     (CORES_X * CORES_Y)
+#define NODES     (NODES_X * NODES_Y * NODES_Z)
+#define LATTICE_X (NODES_X * CORES_X)
+#define LATTICE_Y (NODES_Y)
+#define LATTICE_Z (NODES_Z * CORES_Y)
+
+/* number of timer values */
+#define TIMERS 12
+
+/* pollflag values */
+#define POLL_BUSY  0x00
+#define POLL_READY 0x01
+#define POLL_DONE  0x02
+
+/* data types */
+typedef float    FLOAT;
+typedef FLOAT    node_t[19];
+typedef node_t   block_t[NODES_Z][NODES_Y][NODES_X];
+typedef uint32_t times_t[TIMERS];
+
+/* shared memory structure */
+typedef struct {
+       uint32_t pollflag;
+       uint32_t iteration;
+       times_t  times[CORES_Y][CORES_X];
+       block_t  lattice[CORES_Y][CORES_X];
+} ALIGN(8) shm_t;
+
+#endif /* _SHARED_H_ */
+