--- /dev/null
+# Template Makefile for Epiphany
+
+# host toolchain
+HCC = gcc
+HCFLAGS = -O2 -std=c99 -I$(EPIPHANY_HOME)/tools/host/include -Wall
+HLFLAGS = -lm -L$(EPIPHANY_HOME)/tools/host/lib -le-hal
+ECHO = /bin/echo -e
+
+# target toolchain
+ECC = e-gcc
+EOC = e-objcopy
+ECFLAGS = -Os -std=c99 -falign-loops=8 -falign-functions=8 -Wall -fsingle-precision-constant -ffast-math
+ELFLAGS = -T$(EPIPHANY_HOME)/bsps/current/internal.ldf -le-lib
+EOFLAGS = -R .shared_dram -R .data_bank1 -R .data_bank2 -R .data_bank3
+
+# host application
+HAPP = $(DEST)/ep_main
+HOBJS = $(HDEST)/main.o $(HDEST)/data.o
+
+# epiphany applications
+EAPPS = $(DEST)/main.srec
+ECOMMON = $(EDEST)/d3q19.o
+
+# folders
+HSRC = hsrc
+HDEST = hobj
+ESRC = esrc
+EDEST = eobj
+DEST = bin
+
+# === Magic begins here ===================================================
+EOBJS = $(EAPPS:$(DEST)%srec=$(EDEST)%o) $(ECOMMON)
+EELFS = $(EAPPS:$(DEST)%srec=$(EDEST)%elf)
+
+.SECONDARY:
+.PHONY: all help host target folders run clean
+.NOTPARALLEL: clean
+
+# === Phony Rules =========================================================
+help:
+ @$(ECHO)
+ @$(ECHO) "Epiphany Makefile - Help"
+ @$(ECHO) " help show this help"
+ @$(ECHO) " host build host application ($(HAPP))"
+ @$(ECHO) " target build epiphany applications ($(EAPPS))"
+ @$(ECHO) " all build all"
+ @$(ECHO) " run build all, then run host application"
+ @$(ECHO) " clean remove applications and intermediate files"
+ @$(ECHO)
+
+all: host target
+
+host: folders $(HAPP)
+
+target: folders $(EAPPS)
+
+folders: $(HDEST) $(EDEST) $(DEST)
+
+run: host target
+ @$(ECHO) "\tRUN"
+ @sudo LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) \
+ EPIPHANY_HDF=$(EPIPHANY_HDF) \
+ $(HAPP)
+
+clean:
+ @$(ECHO) "\tCLEAN"
+ @rm -v -f $(HAPP) $(HOBJS) $(EAPPS) $(EELFS) $(EOBJS)
+ @-rmdir -v --ignore-fail-on-non-empty $(HDEST) $(EDEST) $(DEST) \
+ 2>/dev/null
+
+$(HDEST):
+ @$(ECHO) "\t(HOST) MKDIR $(HDEST)"
+ @mkdir -p $(HDEST)
+
+$(EDEST):
+ @$(ECHO) "\t(HOST) MKDIR $(EDEST)"
+ @mkdir -p $(EDEST)
+
+$(DEST):
+ @$(ECHO) "\t(HOST) MKDIR $(DEST)"
+ @mkdir -p $(DEST)
+
+# === Host Rules ==========================================================
+$(HAPP): $(HOBJS)
+ @$(ECHO) "\t(HOST) LINK\t$@"
+ @$(HCC) -o $@ $^ $(HLFLAGS)
+
+$(HDEST)/%.o: $(HSRC)/%.c
+ @$(ECHO) "\t(HOST) CC\t$@"
+ @$(HCC) $(HCFLAGS) -c -o $@ $^
+
+# === Target Rules ========================================================
+$(DEST)/%.srec: $(EDEST)/%.elf
+ @$(ECHO) "\t(TARGET) OBJCOPY $@"
+ @$(EOC) $(EOFLAGS) --output-target srec --srec-forceS3 $^ $@
+
+$(EDEST)/%.elf: $(EDEST)/%.o $(ECOMMON)
+ @$(ECHO) "\t(TARGET) LINK\t$@"
+ @$(ECC) -o $@ $^ $(ELFLAGS)
+
+$(EDEST)/%.o: $(ESRC)/%.c
+ @$(ECHO) "\t(TARGET) CC\t$@"
+ @$(ECC) $(ECFLAGS) -c -o $@ $^
+# =========================================================================
+
--- /dev/null
+/* Lattice Boltzmann Algorithm Implementation (D3Q19) */
+
+#include <e-lib.h>
+#include "../shared.h"
+
+/* core indices */
+extern unsigned int row, col, core;
+
+/* ================================================================== */
+
+static const int d3q19_v[19][3] = { { 0, 0, 0},
+ {-1, 0, 0}, { 0,-1, 0}, { 0, 0,-1}, {-1,-1, 0}, {-1, 1, 0}, {-1, 0,-1},
+ {-1, 0, 1}, { 0,-1,-1}, { 0,-1, 1}, { 1, 0, 0}, { 0, 1, 0}, { 0, 0, 1},
+ { 1, 1, 0}, { 1,-1, 0}, { 1, 0, 1}, { 1, 0,-1}, { 0, 1, 1}, { 0, 1,-1},
+};
+static const FLOAT d3q19_w[19] = { 1./3.,
+ 1./18., 1./18., 1./18., 1./36., 1./36., 1./36.,
+ 1./36., 1./36., 1./36., 1./18., 1./18., 1./18.,
+ 1./36., 1./36., 1./36., 1./36., 1./36., 1./36.,
+};
+
+/* ================================================================== */
+
+void init(block_t f)
+{
+ /* initialize all nodes with rho = 0.1 */
+ for(int z = 0; z < NODES_Z; z++)
+ for(int y = 0; y < NODES_Y; y++)
+ for(int x = 0; x < NODES_X; x++)
+ for(int q = 0; q < 19; q++)
+ f[z][y][x][q] = 0.1 * d3q19_w[q];
+
+#if 1
+ /* except here */
+ if(core == 0)
+ for(int q = 0; q < 19; q++)
+ f[1][1][1][q] = 0.2 * d3q19_w[q];
+#endif
+}
+
+/* ================================================================== */
+
+void collide(block_t f, int x, int y, int z, FLOAT omega)
+{
+#if 0
+ /* Zou/He boundary conditions */
+ if(row == 0 && z == 0) { /* top */
+ const FLOAT Ux = 0.0; /* wall speed */
+ const FLOAT Uy = 0.2;
+ const FLOAT Uz = 0.0;
+
+ FLOAT rho = ( f[z][y][x][0] + f[z][y][x][1] + f[z][y][x][2] +
+ f[z][y][x][4] + f[z][y][x][5] + f[z][y][x][10] +
+ f[z][y][x][11] + f[z][y][x][13] + f[z][y][x][14] +
+ 2 * ( f[z][y][x][16] + f[z][y][x][18] + f[z][y][x][3] +
+ f[z][y][x][6] + f[z][y][x][8] ) / (1 + Uz) );
+
+ FLOAT tmp0 = rho / 2;
+ FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+ FLOAT tmp2 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+ FLOAT tmp3 = ( f[z][y][x][13] - f[z][y][x][4] ) / 2;
+ FLOAT tmp4 = ( f[z][y][x][14] - f[z][y][x][5] ) / 2;
+
+ f[z][y][x][15] = f[z][y][x][6] - tmp1 - tmp3 - tmp4 -
+ tmp0 * (Uz/3 - Ux);
+ f[z][y][x][7] = f[z][y][x][16] + tmp1 + tmp3 + tmp4 -
+ tmp0 * (Uz/3 + Ux);
+ f[z][y][x][17] = f[z][y][x][8] - tmp2 - tmp3 + tmp4 -
+ tmp0 * (Uz/3 - Uy);
+ f[z][y][x][9] = f[z][y][x][18] + tmp2 + tmp3 - tmp4 -
+ tmp0 * (Uz/3 + Uy);
+ f[z][y][x][12] = f[z][y][x][3] - rho * Uz / 3;
+
+ } else if(row == CORES_Y-1 && z == NODES_Z-1) { /* bottom */
+ FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+ FLOAT tmp2 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+ FLOAT tmp3 = ( f[z][y][x][13] - f[z][y][x][4] ) / 2;
+ FLOAT tmp4 = ( f[z][y][x][14] - f[z][y][x][5] ) / 2;
+
+ f[z][y][x][6] = f[z][y][x][15] + tmp1 + tmp3 + tmp4;
+ f[z][y][x][16] = f[z][y][x][7] - tmp1 - tmp3 - tmp4;
+ f[z][y][x][8] = f[z][y][x][17] + tmp2 + tmp3 - tmp4;
+ f[z][y][x][18] = f[z][y][x][9] - tmp2 - tmp3 + tmp4;
+ f[z][y][x][3] = f[z][y][x][12];
+
+ } else if(y == 0) { /* front */
+ FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+ FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+ FLOAT tmp3 = ( f[z][y][x][15] - f[z][y][x][6] ) / 2;
+ FLOAT tmp4 = ( f[z][y][x][16] - f[z][y][x][7] ) / 2;
+
+ f[z][y][x][13] = f[z][y][x][4] - tmp1 - tmp3 - tmp4;
+ f[z][y][x][5] = f[z][y][x][14] + tmp1 + tmp3 + tmp4;
+ f[z][y][x][17] = f[z][y][x][8] - tmp2 - tmp3 - tmp4;
+ f[z][y][x][18] = f[z][y][x][9] + tmp2 + tmp3 + tmp4;
+ f[z][y][x][11] = f[z][y][x][2];
+
+ } else if(y == NODES_Y-1) { /* back */
+ FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2;
+ FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+ FLOAT tmp3 = ( f[z][y][x][15] - f[z][y][x][6] ) / 2;
+ FLOAT tmp4 = ( f[z][y][x][16] - f[z][y][x][7] ) / 2;
+
+ f[z][y][x][4] = f[z][y][x][13] + tmp1 + tmp3 + tmp4;
+ f[z][y][x][14] = f[z][y][x][5] - tmp1 - tmp3 - tmp4;
+ f[z][y][x][8] = f[z][y][x][17] + tmp2 + tmp3 + tmp4;
+ f[z][y][x][9] = f[z][y][x][18] - tmp2 - tmp3 - tmp4;
+ f[z][y][x][2] = f[z][y][x][11];
+
+ } else if(col == 0 && x == 0) { /* left */
+ FLOAT tmp1 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+ FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+ FLOAT tmp3 = ( f[z][y][x][17] - f[z][y][x][8] ) / 2;
+ FLOAT tmp4 = ( f[z][y][x][18] - f[z][y][x][9] ) / 2;
+
+ f[z][y][x][13] = f[z][y][x][4] - tmp1 - tmp3 - tmp4;
+ f[z][y][x][14] = f[z][y][x][5] + tmp1 + tmp3 + tmp4;
+ f[z][y][x][15] = f[z][y][x][6] - tmp2 - tmp3 + tmp4;
+ f[z][y][x][16] = f[z][y][x][7] + tmp2 + tmp3 - tmp4;
+ f[z][y][x][10] = f[z][y][x][1];
+
+ } else if(col == CORES_X-1 && x == NODES_X-1) { /* right */
+ FLOAT tmp1 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2;
+ FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2;
+ FLOAT tmp3 = ( f[z][y][x][17] - f[z][y][x][8] ) / 2;
+ FLOAT tmp4 = ( f[z][y][x][18] - f[z][y][x][9] ) / 2;
+
+ f[z][y][x][4] = f[z][y][x][13] + tmp1 + tmp3 + tmp4;
+ f[z][y][x][5] = f[z][y][x][14] - tmp1 - tmp3 - tmp4;
+ f[z][y][x][6] = f[z][y][x][15] + tmp2 + tmp3 - tmp4;
+ f[z][y][x][7] = f[z][y][x][16] - tmp2 - tmp3 + tmp4;
+ f[z][y][x][1] = f[z][y][x][10];
+ }
+#endif
+
+ /* macroscopic */
+ FLOAT rho =
+ f[z][y][x][0] + f[z][y][x][1] + f[z][y][x][2] +
+ f[z][y][x][3] + f[z][y][x][4] + f[z][y][x][5] +
+ f[z][y][x][6] + f[z][y][x][7] + f[z][y][x][8] +
+ f[z][y][x][9] + f[z][y][x][10] + f[z][y][x][11] +
+ f[z][y][x][12] + f[z][y][x][13] + f[z][y][x][14] +
+ f[z][y][x][15] + f[z][y][x][16] + f[z][y][x][17] +
+ f[z][y][x][18];
+
+ FLOAT ux = (
+ f[z][y][x][10] + f[z][y][x][13] + f[z][y][x][14] +
+ f[z][y][x][15] + f[z][y][x][16] - f[z][y][x][1] -
+ f[z][y][x][4] - f[z][y][x][5] - f[z][y][x][6] -
+ f[z][y][x][7] ) / rho;
+
+ FLOAT uy = (
+ f[z][y][x][5] + f[z][y][x][11] + f[z][y][x][13] +
+ f[z][y][x][17] + f[z][y][x][18] - f[z][y][x][2] -
+ f[z][y][x][4] - f[z][y][x][8] - f[z][y][x][9] -
+ f[z][y][x][14] ) / rho;
+
+ FLOAT uz = (
+ f[z][y][x][7] + f[z][y][x][9] + f[z][y][x][12] +
+ f[z][y][x][15] + f[z][y][x][17] - f[z][y][x][3] -
+ f[z][y][x][6] - f[z][y][x][8] - f[z][y][x][16] -
+ f[z][y][x][18] ) / rho;
+
+ FLOAT sqr = 1.5 * (ux*ux + uy*uy + uz*uz);
+
+ /* update node */
+ for(int q = 0; q < 19; q++) {
+ FLOAT cu =
+ ux*d3q19_v[q][0] + uy*d3q19_v[q][1] + uz*d3q19_v[q][2];
+ FLOAT eq = rho * d3q19_w[q] *
+ (1. + 3. * cu + 4.5 * cu*cu - sqr);
+ f[z][y][x][q] *= (1.0 - omega);
+ f[z][y][x][q] += omega * eq;
+ }
+
+ /* swap */
+ for(int q = 1; q <= 9; q++) {
+ FLOAT tmp = f[z][y][x][q];
+ f[z][y][x][q] = f[z][y][x][q+9];
+ f[z][y][x][q+9] = tmp;
+ }
+}
+
+/* ================================================================== */
+
+void stream(block_t f, int x, int y, int z)
+{
+ for(int q = 1; q <= 9; q++) {
+ int next_row = row;
+ int next_col = col;
+ int next_x = x + d3q19_v[q][0];
+ int next_y = y + d3q19_v[q][1];
+ int next_z = z + d3q19_v[q][2];
+
+ /* inner borders extend in xz-plane */
+ if(next_x < 0) { next_col--; next_x += NODES_X; }
+ if(next_x >= NODES_X) { next_col++; next_x -= NODES_X; }
+ if(next_z < 0) { next_row--; next_z += NODES_Z; }
+ if(next_z >= NODES_Z) { next_row++; next_z -= NODES_Z; }
+
+ /* outer borders are not streamed */
+ if(next_col < 0) { continue; } /* -x */
+ else if(next_col >= CORES_X) { continue; } /* +x */
+ if(next_y < 0) { continue; } /* -y */
+ else if(next_y >= NODES_Y) { continue; } /* +y */
+ if(next_row < 0) { continue; } /* -z */
+ else if(next_row >= CORES_Y) { continue; } /* +z */
+
+ /* f: local block, g: local/remote block */
+ block_t *g = (void*)f;
+ if(next_row != row || next_col != col) {
+ g = e_get_global_address(next_col, next_row, (void*)f);
+ }
+
+ /* stream by swapping */
+ FLOAT tmp = f[z][y][x][q+9];
+ f[z][y][x][q+9] = (*g)[next_z][next_y][next_x][q];
+ (*g)[next_z][next_y][next_x][q] = tmp;
+ }
+}
+
+/* ================================================================== */
+
+#if 0
+void d2q9_collide_stream_bulk(d2q9_block_t f, FLOAT omega)
+{
+ /* don't touch the border nodes */
+ for(int x = 1; x < BLOCKS_X-1; x++) {
+ for(int y = 1; y < BLOCKS_Y-1; y++) {
+ /* macroscopic */
+ FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+ f[y][x][3] + f[y][x][4] + f[y][x][5] +
+ f[y][x][6] + f[y][x][7] + f[y][x][8];
+ FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+ f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+ FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+ f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
+ FLOAT sqr = 1.5 * (ux*ux + uy*uy);
+
+ /* update node */
+ for(int q = 0; q < 9; q++) {
+ FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
+ FLOAT eq = rho * d2q9_w[q] *
+ (1. + 3. * cu + 4.5 * cu*cu - sqr);
+ f[y][x][q] *= (1.0 - omega);
+ f[y][x][q] += omega * eq;
+ }
+
+ /* stream */
+ for(int q = 0; q <= 4; q++) {
+ int next_x = x + d2q9_v[q][0];
+ int next_y = y + d2q9_v[q][1];
+
+ FLOAT tmp = f[y][x][q];
+ f[y][x][q] = f[y][x][q+4];
+ f[y][x][q+4] = f[next_y][next_x][q];
+ f[next_y][next_x][q] = tmp;
+ }
+ }
+ }
+}
+#endif
--- /dev/null
+/* Lattice Boltzmann Algorithm Header */
+
+#include "../shared.h"
+
+void init (block_t);
+void collide (block_t, int x, int y, int z, FLOAT);
+void stream (block_t, int x, int y, int z);
+void collide_stream_bulk(block_t, FLOAT);
+
--- /dev/null
+/* Lattice Boltzmann Implementation */
+
+#include <stdint.h>
+#include <string.h>
+
+#include <e-lib.h>
+
+#include "d3q19.h"
+#include "../shared.h"
+
+/* shared memory overlay */
+volatile shm_t shm SECTION(".shared_dram");
+
+/* statically allocate dummy memory and local block overlay
+ to prevent linker from putting stuff in banks 1..3 */
+static uint8_t dummy_bank1[8192] USED SECTION(".data_bank1");
+static uint8_t dummy_bank2[8192] USED SECTION(".data_bank2");
+static uint8_t dummy_bank3[8192] USED SECTION(".data_bank3");
+static block_t *block = (void*)0x2000;
+
+/* barrier structures */
+volatile e_barrier_t barriers[CORES];
+ e_barrier_t *tgt_bars[CORES];
+
+/* global index variables */
+unsigned int row, col, core;
+
+#define READ_TIMER(X) \
+ do { \
+ times[X] = E_CTIMER_MAX - e_ctimer_stop(E_CTIMER_0); \
+ e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); \
+ e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); \
+ } while(0);
+
+int main()
+{
+ const FLOAT omega = 1.0;
+ unsigned times[TIMERS] = {0};
+
+ /* compile-time bombs, for E16G3 */
+ BUILD_BUG(NODES * sizeof(node_t) > 24 * 1024);
+ BUILD_BUG(NODES_X < 3 || NODES_Y < 3 || NODES_Z < 3);
+ BUILD_BUG(CORES_X < 1 || CORES_Y < 1);
+ BUILD_BUG(CORES_X > 4 || CORES_Y > 4);
+
+ /* save mesh coordinates */
+ e_coords_from_coreid(e_get_coreid(), &col, &row);
+ core = row * CORES_X + col;
+
+ /* initialize barrier */
+ e_barrier_init(barriers, tgt_bars);
+
+ /* initialize lattice */
+ init(*block);
+
+ /* main loop */
+ for(int iter = 0; iter < 3000; iter++) {
+#if 1
+ /* collide all nodes */
+ for(int z = 0; z < NODES_Z; z++)
+ for(int y = 0; y < NODES_Y; y++)
+ for(int x = 0; x < NODES_X; x++)
+ collide(*block, x, y, z, omega);
+
+ /* synchronize */
+ e_barrier(barriers, tgt_bars);
+
+ /* stream all nodes */
+ for(int z = 0; z < NODES_Z; z++)
+ for(int y = 0; y < NODES_Y; y++)
+ for(int x = 0; x < NODES_X; x++)
+ stream(*block, x, y, z);
+
+#else
+ /* collide boundaries: top, bottom */
+ for(int x = 0; x < BLOCKS_X; x++) {
+ d2q9_collide(*block, x, 0, omega);
+ d2q9_collide(*block, x, BLOCKS_Y-1, omega);
+ }
+
+ /* collide boundaries: left, right */
+ for(int y = 1; y < BLOCKS_Y-1; y++) {
+ d2q9_collide(*block, 0, y, omega);
+ d2q9_collide(*block, BLOCKS_X-1, y, omega);
+ }
+
+ /* synchronize */
+ e_barrier(barriers, tgt_bars);
+
+ /* collide and stream the bulk */
+ d2q9_collide_stream_bulk(*block, omega);
+
+ /* stream the boundaries: top, bottom */
+ for(int x = 0; x < BLOCKS_X; x++) {
+ d2q9_stream(*block, x, 0 );
+ d2q9_stream(*block, x, BLOCKS_Y-1);
+ }
+
+ /* stream the boundaries: left, right */
+ for(int y = 1; y < BLOCKS_Y-1; y++) {
+ d2q9_stream(*block, 0, y);
+ d2q9_stream(*block, BLOCKS_X-1, y);
+ }
+#endif
+
+ /* copy data to shm if necessary */
+ if(!(iter % 3)) {
+ /* copy lattice to shm */
+ memcpy(&shm.lattice[row][col], block, sizeof(block_t));
+
+ /* copy times to shm */
+ memcpy(&shm.times[row][col], times, sizeof(times_t));
+
+ /* synchronize */
+ e_barrier(barriers, tgt_bars);
+
+ /* core 0: write counter and flag host; wait */
+ if(core == 0) {
+ shm.iteration = iter;
+ shm.pollflag = POLL_READY;
+ while(shm.pollflag == POLL_READY);
+ }
+ }
+
+ /* synchronize */
+ e_barrier(barriers, tgt_bars);
+ }
+
+ /* core 0: flag host, all iterations done */
+ if(core == 0) {
+ shm.pollflag = POLL_DONE;
+ }
+
+ /* stop core */
+ while(1) {
+ __asm__ volatile("idle");
+ }
+}
+
--- /dev/null
+/* Helper Functions to handle data (3D) */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <unistd.h>
+
+#include "../shared.h"
+
+/* fix file owner if run with sudo */
+void fixsudo(const char *filename)
+{
+ if(getenv("SUDO_UID") && getenv("SUDO_GID")) {
+ int uid = atoi(getenv("SUDO_UID"));
+ int gid = atoi(getenv("SUDO_GID"));
+ if(chown(filename, uid, gid)) {
+ perror("fixsudo/chown");
+ return;
+ }
+ }
+}
+
+#if 0
+/* write a (semi-) human-readable dump of the lattice */
+void write_populations(d2q9_block_t lattice[CORES_Y][CORES_X], int iter)
+{
+ FILE *file = fopen("populations.dat", "a");
+ if(!file) {
+ perror("write_populations/fopen");
+ return;
+ }
+
+ for(int cy = 0; cy < CORES_Y; cy++) {
+ for(int y = 0; y < BLOCKS_Y; y++) {
+ for(int cx = 0; cx < CORES_X; cx++) {
+ for(int x = 0; x < BLOCKS_X; x++) {
+ fprintf(file, "%3d: [%3d,%3d]: ",
+ iter,
+ cx * BLOCKS_X + x,
+ cy * BLOCKS_Y + y
+ );
+ for(int q = 0; q < 9; q++) {
+ fprintf(file, "%.5f ", lattice[cy][cx][y][x][q]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ }
+ }
+ fprintf(file, "\n");
+
+ /* close */
+ fclose(file);
+
+ return;
+}
+#endif
+
+/* write 8-bit grayscale PPM of particle density in a slice */
+void write_density(block_t f[CORES_Y][CORES_X], int iter)
+{
+ char name[32];
+ FLOAT min = 1.0, max = 0.0;
+ FLOAT rhos[LATTICE_Z][LATTICE_Y][LATTICE_X];
+
+ /* write file header */
+ snprintf(name, 32, "./tmp/i%06d.ppm", iter);
+ FILE *file = fopen(name, "wb");
+ if(!file) {
+ perror("write_density/fopen");
+ return;
+ }
+ fprintf(file, "P5\n%d %d\n%d\n", LATTICE_X, LATTICE_Z, 255);
+
+ /* calculate densities from grid */
+ for(int cy = 0; cy < CORES_Y; cy++)
+ for(int z = 0; z < NODES_Z; z++)
+ for(int y = 0; y < NODES_Y; y++)
+ for(int cx = 0; cx < CORES_X; cx++)
+ for(int x = 0; x < NODES_X; x++)
+ {
+ const int idx[3] = { cx * NODES_X + x, y, cy * NODES_Z + z};
+
+ FLOAT rho = 0;
+ for(int q = 0; q < 19; q++)
+ rho += f[cy][cx][z][y][x][q];
+
+ if(rho < min) min = rho;
+ if(rho > max) max = rho;
+ rhos[idx[2]][idx[1]][idx[0]] = rho;
+ }
+
+ /* write slice of grid */
+ for(int z = 0; z < LATTICE_Z; z++) {
+ for(int x = 0; x < LATTICE_X; x++) {
+ FLOAT value = rhos[z][NODES_Y/2][x];
+
+ unsigned char gray = ( 255 *
+ (value - min) / (max - min) );
+ fwrite(&gray, 1, 1, file);
+ }
+ }
+
+ /* close the file and fix permissions */
+ fclose(file);
+ fixsudo(name);
+
+ return;
+}
+
+/* write 8-bit grayscale PPM of absolute particle velocity in a slice */
+void write_velocity(block_t f[CORES_Y][CORES_X], int iter)
+{
+ char name[32];
+ FLOAT min = 1.0, max = 0.0;
+ FLOAT vs[LATTICE_Z][LATTICE_Y][LATTICE_X];
+
+ /* write file header */
+ snprintf(name, 32, "./tmp/i%06d.ppm", iter);
+ FILE *file = fopen(name, "wb");
+ if(!file) {
+ perror("write_density/fopen");
+ return;
+ }
+ fprintf(file, "P5\n%d %d\n%d\n", LATTICE_X, LATTICE_Z, 255);
+
+ /* calculate velocities from grid */
+ for(int cy = 0; cy < CORES_Y; cy++)
+ for(int z = 0; z < NODES_Z; z++)
+ for(int y = 0; y < NODES_Y; y++)
+ for(int cx = 0; cx < CORES_X; cx++)
+ for(int x = 0; x < NODES_X; x++)
+ {
+ const int idx[3] = { cx * NODES_X + x, y, cy * NODES_Z + z};
+
+ FLOAT rho = 0;
+ for(int q = 0; q < 19; q++)
+ rho += f[cy][cx][z][y][x][q];
+
+ FLOAT ux = (
+ f[cy][cx][z][y][x][10] + f[cy][cx][z][y][x][13] +
+ f[cy][cx][z][y][x][14] + f[cy][cx][z][y][x][15] +
+ f[cy][cx][z][y][x][16] - f[cy][cx][z][y][x][1] -
+ f[cy][cx][z][y][x][4] - f[cy][cx][z][y][x][5] -
+ f[cy][cx][z][y][x][6] - f[cy][cx][z][y][x][7]
+ ) / rho;
+
+ FLOAT uy = (
+ f[cy][cx][z][y][x][5] + f[cy][cx][z][y][x][11] +
+ f[cy][cx][z][y][x][13] + f[cy][cx][z][y][x][17] +
+ f[cy][cx][z][y][x][18] - f[cy][cx][z][y][x][2] -
+ f[cy][cx][z][y][x][4] - f[cy][cx][z][y][x][8] -
+ f[cy][cx][z][y][x][9] - f[cy][cx][z][y][x][14]
+ ) / rho;
+
+ FLOAT uz = (
+ f[cy][cx][z][y][x][7] + f[cy][cx][z][y][x][9] +
+ f[cy][cx][z][y][x][12] + f[cy][cx][z][y][x][15] +
+ f[cy][cx][z][y][x][17] - f[cy][cx][z][y][x][3] -
+ f[cy][cx][z][y][x][6] - f[cy][cx][z][y][x][8] -
+ f[cy][cx][z][y][x][16] - f[cy][cx][z][y][x][18]
+ ) / rho;
+
+ FLOAT u = sqrtf(ux*ux + uy*uy + uz*uz);
+ if(u < min) min = u;
+ if(u > max) max = u;
+ vs[idx[2]][idx[1]][idx[0]] = u;
+ }
+
+ /* write slice of grid */
+ for(int z = 0; z < LATTICE_Z; z++) {
+ for(int x = 0; x < LATTICE_X; x++) {
+ FLOAT value = vs[z][NODES_Y/2][x];
+
+ unsigned char gray = ( 255 *
+ (value - min) / (max - min) );
+ fwrite(&gray, 1, 1, file);
+ }
+ }
+
+ /* close the file and fix permissions */
+ fclose(file);
+ fixsudo(name);
+
+ return;
+}
+
+/* convert image files to animated gif ./tmp/anim.gif */
+void convert_to_gif(void)
+{
+ int result;
+
+ /* call imagemagick */
+ result = system("convert ./tmp/i*.ppm ./tmp/anim.gif"); (void)result;
+ fixsudo("./tmp/anim.gif");
+
+ return;
+}
+
+/* convert image files to mp4 ./tmp/anim.mp4 */
+void convert_to_mp4(void)
+{
+ int result;
+
+ /* call ffmpeg
+ FIXME: requires ffmpeg binary with '-pattern_type glob' support,
+ so this will not work on the Parallella without updates! */
+ result = system("ffmpeg -v quiet -pattern_type glob -i "
+ "'./tmp/i*.ppm' ./tmp/anim.mp4"); (void)result;
+ fixsudo("./tmp/anim.mp4");
+
+ return;
+}
+
+/* write timer values */
+void write_timers(times_t t[CORES_Y][CORES_X], uint32_t iter)
+{
+ FILE *file = fopen("timers.dat", "ab");
+ if(!file) {
+ perror("write_timers/fopen");
+ return;
+ }
+
+ fprintf(file, "Iteration: i=%d\n", iter);
+ for(int y = 0; y < CORES_Y; y++) {
+ for(int x = 0; x < CORES_X; x++) {
+ fprintf(file, "[%d,%d]: ", x, y);
+ for(int i = 0; i < TIMERS; i++) {
+ fprintf(file, "%8d ", t[y][x][i]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+
+ fclose(file);
+}
+
--- /dev/null
+/* Host Application */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include <e-hal.h>
+#include "../shared.h"
+
+#define FAIL(...) { fprintf(stderr, __VA_ARGS__); exit(1); }
+#define SHM_OFFSET 0x01000000
+
+/* helper functions */
+void fixsudo(const char *filename);
+void write_populations(block_t f[CORES_Y][CORES_X], int iter);
+void write_density (block_t f[CORES_Y][CORES_X], int iter);
+void write_velocity (block_t f[CORES_Y][CORES_X], int iter);
+void convert_to_gif(void);
+void convert_to_mp4(void);
+void write_timers(times_t t[CORES_Y][CORES_X], int iter);
+
+/* globals */
+static shm_t shm = { 0 }; /* local shm copy */
+static uint32_t pollflag;
+
+int main()
+{
+ char *filename = "bin/main.srec";
+
+ /* remove old results */
+ int dummy = system("rm -f ./tmp/i*.ppm ./tmp/anim.gif ./tmp/anim.mp4 populations.dat timers.dat");
+ (void)dummy;
+
+ e_epiphany_t dev;
+ e_mem_t mem;
+
+ e_set_host_verbosity(H_D0);
+ e_set_loader_verbosity(L_D0);
+
+ /* initialize workgroup, allocate and clear shared memory */
+ if(e_init(NULL) != E_OK)
+ FAIL("Can't init!\n");
+ e_reset_system();
+ if(e_open(&dev, 0, 0, CORES_X, CORES_Y) != E_OK)
+ FAIL("Can't open!\n");
+ if(e_alloc(&mem, SHM_OFFSET, sizeof(shm_t)) != E_OK)
+ FAIL("Can't alloc!\n");
+ if(e_write(&mem, 0, 0, (off_t)0, &shm, sizeof(shm_t)) == E_ERR)
+ FAIL("Can't clear shm!\n");
+
+ /* load programs */
+ printf("Starting cores:\n");
+ for(int y = 0; y < CORES_Y; y++) {
+ for(int x = 0; x < CORES_X; x++) {
+ printf("(%02d,%02d) ", x, y);
+ if(e_load(filename, &dev, x, y, E_TRUE) != E_OK)
+ FAIL("Can't load!\n");
+ }
+ printf("\n");
+ }
+
+ /* ================================================================ */
+ printf("Polling shared memory.\n");
+ while(1) {
+
+ while(1) {
+ /* read polling flag */
+ if(e_read(&mem, 0, 0, (off_t)0, &pollflag,
+ sizeof(uint32_t)) == E_ERR)
+ FAIL("Can't read pollflag!\n");
+
+ /* wait for something */
+ if(pollflag != POLL_BUSY) break;
+ }
+
+ /* finish if done */
+ if(pollflag == POLL_DONE) break;
+
+ /* read full shared memory */
+ if(e_read(&mem, 0, 0, (off_t)0, &shm, sizeof(shm_t)) == E_ERR)
+ FAIL("Can't read full shm!\n");
+
+ /* reset pollflag */
+ pollflag = 0;
+ if(e_write(&mem, 0, 0, (off_t)0, &pollflag,
+ sizeof(uint32_t)) == E_ERR) {
+ FAIL("Can't reset pollflag!\n");
+ }
+
+ /* print iteration */
+ printf("0x%08x\r", shm.iteration); fflush(stdout);
+
+ /* write data */
+ //write_populations(shm.lattice, shm.iteration);
+ write_density(shm.lattice, shm.iteration);
+ //write_velocity(shm.lattice, shm.iteration);
+ write_timers(shm.times, shm.iteration);
+ }
+ /* ================================================================ */
+
+ if(e_free(&mem) != E_OK) FAIL("Can't free!\n");
+ if(e_close(&dev) != E_OK) FAIL("Can't close!\n");
+ if(e_finalize() != E_OK) FAIL("Can't finalize!\n");
+
+ fixsudo("populations.dat");
+ fixsudo("timers.dat");
+
+ printf("\nProgram finished successfully.\n");
+ printf("Convert ...\n");
+ convert_to_gif();
+
+ return(0);
+}
+
--- /dev/null
+/* shared data types and external memory layout */
+#ifndef _SHARED_H_
+#define _SHARED_H_
+
+#include <stdint.h>
+
+/* ================================================================== */
+
+/* number of cores */
+#define CORES_X 4
+#define CORES_Y 4
+
+/* number of nodes per core */
+#define NODES_X 10
+#define NODES_Y 3
+#define NODES_Z 10
+
+/* ================================================================== */
+
+/* preprocessor magic */
+#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
+#define USED __attribute__((used))
+#undef PACKED
+#define PACKED __attribute__((packed))
+#undef ALIGN
+#define ALIGN(X) __attribute__((aligned(X)))
+
+/* some calculations */
+#define CORES (CORES_X * CORES_Y)
+#define NODES (NODES_X * NODES_Y * NODES_Z)
+#define LATTICE_X (NODES_X * CORES_X)
+#define LATTICE_Y (NODES_Y)
+#define LATTICE_Z (NODES_Z * CORES_Y)
+
+/* number of timer values */
+#define TIMERS 12
+
+/* pollflag values */
+#define POLL_BUSY 0x00
+#define POLL_READY 0x01
+#define POLL_DONE 0x02
+
+/* data types */
+typedef float FLOAT;
+typedef FLOAT node_t[19];
+typedef node_t block_t[NODES_Z][NODES_Y][NODES_X];
+typedef uint32_t times_t[TIMERS];
+
+/* shared memory structure */
+typedef struct {
+ uint32_t pollflag;
+ uint32_t iteration;
+ times_t times[CORES_Y][CORES_X];
+ block_t lattice[CORES_Y][CORES_X];
+} ALIGN(8) shm_t;
+
+#endif /* _SHARED_H_ */
+