From 9fcf834c54dddc00c8c233a73d6782bf5294c7e0 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Mon, 1 Sep 2014 19:28:36 +0000 Subject: [PATCH] d3q19: initial upload (no Zou/He) Will not compile with Zou/He boundaries due to linker error. The code does not fit in Bank 0, even with -Os. --- d3q19/Makefile | 105 ++++++++++++++++++ d3q19/esrc/d3q19.c | 262 +++++++++++++++++++++++++++++++++++++++++++++ d3q19/esrc/d3q19.h | 9 ++ d3q19/esrc/main.c | 139 ++++++++++++++++++++++++ d3q19/hsrc/data.c | 237 ++++++++++++++++++++++++++++++++++++++++ d3q19/hsrc/main.c | 117 ++++++++++++++++++++ d3q19/shared.h | 58 ++++++++++ 7 files changed, 927 insertions(+) create mode 100644 d3q19/Makefile create mode 100644 d3q19/esrc/d3q19.c create mode 100644 d3q19/esrc/d3q19.h create mode 100644 d3q19/esrc/main.c create mode 100644 d3q19/hsrc/data.c create mode 100644 d3q19/hsrc/main.c create mode 100644 d3q19/shared.h diff --git a/d3q19/Makefile b/d3q19/Makefile new file mode 100644 index 0000000..5674c3d --- /dev/null +++ b/d3q19/Makefile @@ -0,0 +1,105 @@ +# Template Makefile for Epiphany + +# host toolchain +HCC = gcc +HCFLAGS = -O2 -std=c99 -I$(EPIPHANY_HOME)/tools/host/include -Wall +HLFLAGS = -lm -L$(EPIPHANY_HOME)/tools/host/lib -le-hal +ECHO = /bin/echo -e + +# target toolchain +ECC = e-gcc +EOC = e-objcopy +ECFLAGS = -Os -std=c99 -falign-loops=8 -falign-functions=8 -Wall -fsingle-precision-constant -ffast-math +ELFLAGS = -T$(EPIPHANY_HOME)/bsps/current/internal.ldf -le-lib +EOFLAGS = -R .shared_dram -R .data_bank1 -R .data_bank2 -R .data_bank3 + +# host application +HAPP = $(DEST)/ep_main +HOBJS = $(HDEST)/main.o $(HDEST)/data.o + +# epiphany applications +EAPPS = $(DEST)/main.srec +ECOMMON = $(EDEST)/d3q19.o + +# folders +HSRC = hsrc +HDEST = hobj +ESRC = esrc +EDEST = eobj +DEST = bin + +# === Magic begins here =================================================== +EOBJS = $(EAPPS:$(DEST)%srec=$(EDEST)%o) $(ECOMMON) +EELFS = $(EAPPS:$(DEST)%srec=$(EDEST)%elf) + +.SECONDARY: +.PHONY: all help host target folders run clean +.NOTPARALLEL: clean + +# === Phony Rules ========================================================= +help: + @$(ECHO) + @$(ECHO) "Epiphany Makefile - Help" + @$(ECHO) " help show this help" + @$(ECHO) " host build host application ($(HAPP))" + @$(ECHO) " target build epiphany applications ($(EAPPS))" + @$(ECHO) " all build all" + @$(ECHO) " run build all, then run host application" + @$(ECHO) " clean remove applications and intermediate files" + @$(ECHO) + +all: host target + +host: folders $(HAPP) + +target: folders $(EAPPS) + +folders: $(HDEST) $(EDEST) $(DEST) + +run: host target + @$(ECHO) "\tRUN" + @sudo LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) \ + EPIPHANY_HDF=$(EPIPHANY_HDF) \ + $(HAPP) + +clean: + @$(ECHO) "\tCLEAN" + @rm -v -f $(HAPP) $(HOBJS) $(EAPPS) $(EELFS) $(EOBJS) + @-rmdir -v --ignore-fail-on-non-empty $(HDEST) $(EDEST) $(DEST) \ + 2>/dev/null + +$(HDEST): + @$(ECHO) "\t(HOST) MKDIR $(HDEST)" + @mkdir -p $(HDEST) + +$(EDEST): + @$(ECHO) "\t(HOST) MKDIR $(EDEST)" + @mkdir -p $(EDEST) + +$(DEST): + @$(ECHO) "\t(HOST) MKDIR $(DEST)" + @mkdir -p $(DEST) + +# === Host Rules ========================================================== +$(HAPP): $(HOBJS) + @$(ECHO) "\t(HOST) LINK\t$@" + @$(HCC) -o $@ $^ $(HLFLAGS) + +$(HDEST)/%.o: $(HSRC)/%.c + @$(ECHO) "\t(HOST) CC\t$@" + @$(HCC) $(HCFLAGS) -c -o $@ $^ + +# === Target Rules ======================================================== +$(DEST)/%.srec: $(EDEST)/%.elf + @$(ECHO) "\t(TARGET) OBJCOPY $@" + @$(EOC) $(EOFLAGS) --output-target srec --srec-forceS3 $^ $@ + +$(EDEST)/%.elf: $(EDEST)/%.o $(ECOMMON) + @$(ECHO) "\t(TARGET) LINK\t$@" + @$(ECC) -o $@ $^ $(ELFLAGS) + +$(EDEST)/%.o: $(ESRC)/%.c + @$(ECHO) "\t(TARGET) CC\t$@" + @$(ECC) $(ECFLAGS) -c -o $@ $^ +# ========================================================================= + diff --git a/d3q19/esrc/d3q19.c b/d3q19/esrc/d3q19.c new file mode 100644 index 0000000..d50c11b --- /dev/null +++ b/d3q19/esrc/d3q19.c @@ -0,0 +1,262 @@ +/* Lattice Boltzmann Algorithm Implementation (D3Q19) */ + +#include +#include "../shared.h" + +/* core indices */ +extern unsigned int row, col, core; + +/* ================================================================== */ + +static const int d3q19_v[19][3] = { { 0, 0, 0}, + {-1, 0, 0}, { 0,-1, 0}, { 0, 0,-1}, {-1,-1, 0}, {-1, 1, 0}, {-1, 0,-1}, + {-1, 0, 1}, { 0,-1,-1}, { 0,-1, 1}, { 1, 0, 0}, { 0, 1, 0}, { 0, 0, 1}, + { 1, 1, 0}, { 1,-1, 0}, { 1, 0, 1}, { 1, 0,-1}, { 0, 1, 1}, { 0, 1,-1}, +}; +static const FLOAT d3q19_w[19] = { 1./3., + 1./18., 1./18., 1./18., 1./36., 1./36., 1./36., + 1./36., 1./36., 1./36., 1./18., 1./18., 1./18., + 1./36., 1./36., 1./36., 1./36., 1./36., 1./36., +}; + +/* ================================================================== */ + +void init(block_t f) +{ + /* initialize all nodes with rho = 0.1 */ + for(int z = 0; z < NODES_Z; z++) + for(int y = 0; y < NODES_Y; y++) + for(int x = 0; x < NODES_X; x++) + for(int q = 0; q < 19; q++) + f[z][y][x][q] = 0.1 * d3q19_w[q]; + +#if 1 + /* except here */ + if(core == 0) + for(int q = 0; q < 19; q++) + f[1][1][1][q] = 0.2 * d3q19_w[q]; +#endif +} + +/* ================================================================== */ + +void collide(block_t f, int x, int y, int z, FLOAT omega) +{ +#if 0 + /* Zou/He boundary conditions */ + if(row == 0 && z == 0) { /* top */ + const FLOAT Ux = 0.0; /* wall speed */ + const FLOAT Uy = 0.2; + const FLOAT Uz = 0.0; + + FLOAT rho = ( f[z][y][x][0] + f[z][y][x][1] + f[z][y][x][2] + + f[z][y][x][4] + f[z][y][x][5] + f[z][y][x][10] + + f[z][y][x][11] + f[z][y][x][13] + f[z][y][x][14] + + 2 * ( f[z][y][x][16] + f[z][y][x][18] + f[z][y][x][3] + + f[z][y][x][6] + f[z][y][x][8] ) / (1 + Uz) ); + + FLOAT tmp0 = rho / 2; + FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2; + FLOAT tmp2 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2; + FLOAT tmp3 = ( f[z][y][x][13] - f[z][y][x][4] ) / 2; + FLOAT tmp4 = ( f[z][y][x][14] - f[z][y][x][5] ) / 2; + + f[z][y][x][15] = f[z][y][x][6] - tmp1 - tmp3 - tmp4 - + tmp0 * (Uz/3 - Ux); + f[z][y][x][7] = f[z][y][x][16] + tmp1 + tmp3 + tmp4 - + tmp0 * (Uz/3 + Ux); + f[z][y][x][17] = f[z][y][x][8] - tmp2 - tmp3 + tmp4 - + tmp0 * (Uz/3 - Uy); + f[z][y][x][9] = f[z][y][x][18] + tmp2 + tmp3 - tmp4 - + tmp0 * (Uz/3 + Uy); + f[z][y][x][12] = f[z][y][x][3] - rho * Uz / 3; + + } else if(row == CORES_Y-1 && z == NODES_Z-1) { /* bottom */ + FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2; + FLOAT tmp2 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2; + FLOAT tmp3 = ( f[z][y][x][13] - f[z][y][x][4] ) / 2; + FLOAT tmp4 = ( f[z][y][x][14] - f[z][y][x][5] ) / 2; + + f[z][y][x][6] = f[z][y][x][15] + tmp1 + tmp3 + tmp4; + f[z][y][x][16] = f[z][y][x][7] - tmp1 - tmp3 - tmp4; + f[z][y][x][8] = f[z][y][x][17] + tmp2 + tmp3 - tmp4; + f[z][y][x][18] = f[z][y][x][9] - tmp2 - tmp3 + tmp4; + f[z][y][x][3] = f[z][y][x][12]; + + } else if(y == 0) { /* front */ + FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2; + FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2; + FLOAT tmp3 = ( f[z][y][x][15] - f[z][y][x][6] ) / 2; + FLOAT tmp4 = ( f[z][y][x][16] - f[z][y][x][7] ) / 2; + + f[z][y][x][13] = f[z][y][x][4] - tmp1 - tmp3 - tmp4; + f[z][y][x][5] = f[z][y][x][14] + tmp1 + tmp3 + tmp4; + f[z][y][x][17] = f[z][y][x][8] - tmp2 - tmp3 - tmp4; + f[z][y][x][18] = f[z][y][x][9] + tmp2 + tmp3 + tmp4; + f[z][y][x][11] = f[z][y][x][2]; + + } else if(y == NODES_Y-1) { /* back */ + FLOAT tmp1 = ( f[z][y][x][10] - f[z][y][x][1] ) / 2; + FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2; + FLOAT tmp3 = ( f[z][y][x][15] - f[z][y][x][6] ) / 2; + FLOAT tmp4 = ( f[z][y][x][16] - f[z][y][x][7] ) / 2; + + f[z][y][x][4] = f[z][y][x][13] + tmp1 + tmp3 + tmp4; + f[z][y][x][14] = f[z][y][x][5] - tmp1 - tmp3 - tmp4; + f[z][y][x][8] = f[z][y][x][17] + tmp2 + tmp3 + tmp4; + f[z][y][x][9] = f[z][y][x][18] - tmp2 - tmp3 - tmp4; + f[z][y][x][2] = f[z][y][x][11]; + + } else if(col == 0 && x == 0) { /* left */ + FLOAT tmp1 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2; + FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2; + FLOAT tmp3 = ( f[z][y][x][17] - f[z][y][x][8] ) / 2; + FLOAT tmp4 = ( f[z][y][x][18] - f[z][y][x][9] ) / 2; + + f[z][y][x][13] = f[z][y][x][4] - tmp1 - tmp3 - tmp4; + f[z][y][x][14] = f[z][y][x][5] + tmp1 + tmp3 + tmp4; + f[z][y][x][15] = f[z][y][x][6] - tmp2 - tmp3 + tmp4; + f[z][y][x][16] = f[z][y][x][7] + tmp2 + tmp3 - tmp4; + f[z][y][x][10] = f[z][y][x][1]; + + } else if(col == CORES_X-1 && x == NODES_X-1) { /* right */ + FLOAT tmp1 = ( f[z][y][x][11] - f[z][y][x][2] ) / 2; + FLOAT tmp2 = ( f[z][y][x][12] - f[z][y][x][3] ) / 2; + FLOAT tmp3 = ( f[z][y][x][17] - f[z][y][x][8] ) / 2; + FLOAT tmp4 = ( f[z][y][x][18] - f[z][y][x][9] ) / 2; + + f[z][y][x][4] = f[z][y][x][13] + tmp1 + tmp3 + tmp4; + f[z][y][x][5] = f[z][y][x][14] - tmp1 - tmp3 - tmp4; + f[z][y][x][6] = f[z][y][x][15] + tmp2 + tmp3 - tmp4; + f[z][y][x][7] = f[z][y][x][16] - tmp2 - tmp3 + tmp4; + f[z][y][x][1] = f[z][y][x][10]; + } +#endif + + /* macroscopic */ + FLOAT rho = + f[z][y][x][0] + f[z][y][x][1] + f[z][y][x][2] + + f[z][y][x][3] + f[z][y][x][4] + f[z][y][x][5] + + f[z][y][x][6] + f[z][y][x][7] + f[z][y][x][8] + + f[z][y][x][9] + f[z][y][x][10] + f[z][y][x][11] + + f[z][y][x][12] + f[z][y][x][13] + f[z][y][x][14] + + f[z][y][x][15] + f[z][y][x][16] + f[z][y][x][17] + + f[z][y][x][18]; + + FLOAT ux = ( + f[z][y][x][10] + f[z][y][x][13] + f[z][y][x][14] + + f[z][y][x][15] + f[z][y][x][16] - f[z][y][x][1] - + f[z][y][x][4] - f[z][y][x][5] - f[z][y][x][6] - + f[z][y][x][7] ) / rho; + + FLOAT uy = ( + f[z][y][x][5] + f[z][y][x][11] + f[z][y][x][13] + + f[z][y][x][17] + f[z][y][x][18] - f[z][y][x][2] - + f[z][y][x][4] - f[z][y][x][8] - f[z][y][x][9] - + f[z][y][x][14] ) / rho; + + FLOAT uz = ( + f[z][y][x][7] + f[z][y][x][9] + f[z][y][x][12] + + f[z][y][x][15] + f[z][y][x][17] - f[z][y][x][3] - + f[z][y][x][6] - f[z][y][x][8] - f[z][y][x][16] - + f[z][y][x][18] ) / rho; + + FLOAT sqr = 1.5 * (ux*ux + uy*uy + uz*uz); + + /* update node */ + for(int q = 0; q < 19; q++) { + FLOAT cu = + ux*d3q19_v[q][0] + uy*d3q19_v[q][1] + uz*d3q19_v[q][2]; + FLOAT eq = rho * d3q19_w[q] * + (1. + 3. * cu + 4.5 * cu*cu - sqr); + f[z][y][x][q] *= (1.0 - omega); + f[z][y][x][q] += omega * eq; + } + + /* swap */ + for(int q = 1; q <= 9; q++) { + FLOAT tmp = f[z][y][x][q]; + f[z][y][x][q] = f[z][y][x][q+9]; + f[z][y][x][q+9] = tmp; + } +} + +/* ================================================================== */ + +void stream(block_t f, int x, int y, int z) +{ + for(int q = 1; q <= 9; q++) { + int next_row = row; + int next_col = col; + int next_x = x + d3q19_v[q][0]; + int next_y = y + d3q19_v[q][1]; + int next_z = z + d3q19_v[q][2]; + + /* inner borders extend in xz-plane */ + if(next_x < 0) { next_col--; next_x += NODES_X; } + if(next_x >= NODES_X) { next_col++; next_x -= NODES_X; } + if(next_z < 0) { next_row--; next_z += NODES_Z; } + if(next_z >= NODES_Z) { next_row++; next_z -= NODES_Z; } + + /* outer borders are not streamed */ + if(next_col < 0) { continue; } /* -x */ + else if(next_col >= CORES_X) { continue; } /* +x */ + if(next_y < 0) { continue; } /* -y */ + else if(next_y >= NODES_Y) { continue; } /* +y */ + if(next_row < 0) { continue; } /* -z */ + else if(next_row >= CORES_Y) { continue; } /* +z */ + + /* f: local block, g: local/remote block */ + block_t *g = (void*)f; + if(next_row != row || next_col != col) { + g = e_get_global_address(next_col, next_row, (void*)f); + } + + /* stream by swapping */ + FLOAT tmp = f[z][y][x][q+9]; + f[z][y][x][q+9] = (*g)[next_z][next_y][next_x][q]; + (*g)[next_z][next_y][next_x][q] = tmp; + } +} + +/* ================================================================== */ + +#if 0 +void d2q9_collide_stream_bulk(d2q9_block_t f, FLOAT omega) +{ + /* don't touch the border nodes */ + for(int x = 1; x < BLOCKS_X-1; x++) { + for(int y = 1; y < BLOCKS_Y-1; y++) { + /* macroscopic */ + FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] + + f[y][x][3] + f[y][x][4] + f[y][x][5] + + f[y][x][6] + f[y][x][7] + f[y][x][8]; + FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] - + f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho; + FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] - + f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho; + FLOAT sqr = 1.5 * (ux*ux + uy*uy); + + /* update node */ + for(int q = 0; q < 9; q++) { + FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1]; + FLOAT eq = rho * d2q9_w[q] * + (1. + 3. * cu + 4.5 * cu*cu - sqr); + f[y][x][q] *= (1.0 - omega); + f[y][x][q] += omega * eq; + } + + /* stream */ + for(int q = 0; q <= 4; q++) { + int next_x = x + d2q9_v[q][0]; + int next_y = y + d2q9_v[q][1]; + + FLOAT tmp = f[y][x][q]; + f[y][x][q] = f[y][x][q+4]; + f[y][x][q+4] = f[next_y][next_x][q]; + f[next_y][next_x][q] = tmp; + } + } + } +} +#endif diff --git a/d3q19/esrc/d3q19.h b/d3q19/esrc/d3q19.h new file mode 100644 index 0000000..3b8f57a --- /dev/null +++ b/d3q19/esrc/d3q19.h @@ -0,0 +1,9 @@ +/* Lattice Boltzmann Algorithm Header */ + +#include "../shared.h" + +void init (block_t); +void collide (block_t, int x, int y, int z, FLOAT); +void stream (block_t, int x, int y, int z); +void collide_stream_bulk(block_t, FLOAT); + diff --git a/d3q19/esrc/main.c b/d3q19/esrc/main.c new file mode 100644 index 0000000..625fb67 --- /dev/null +++ b/d3q19/esrc/main.c @@ -0,0 +1,139 @@ +/* Lattice Boltzmann Implementation */ + +#include +#include + +#include + +#include "d3q19.h" +#include "../shared.h" + +/* shared memory overlay */ +volatile shm_t shm SECTION(".shared_dram"); + +/* statically allocate dummy memory and local block overlay + to prevent linker from putting stuff in banks 1..3 */ +static uint8_t dummy_bank1[8192] USED SECTION(".data_bank1"); +static uint8_t dummy_bank2[8192] USED SECTION(".data_bank2"); +static uint8_t dummy_bank3[8192] USED SECTION(".data_bank3"); +static block_t *block = (void*)0x2000; + +/* barrier structures */ +volatile e_barrier_t barriers[CORES]; + e_barrier_t *tgt_bars[CORES]; + +/* global index variables */ +unsigned int row, col, core; + +#define READ_TIMER(X) \ + do { \ + times[X] = E_CTIMER_MAX - e_ctimer_stop(E_CTIMER_0); \ + e_ctimer_set(E_CTIMER_0, E_CTIMER_MAX); \ + e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK); \ + } while(0); + +int main() +{ + const FLOAT omega = 1.0; + unsigned times[TIMERS] = {0}; + + /* compile-time bombs, for E16G3 */ + BUILD_BUG(NODES * sizeof(node_t) > 24 * 1024); + BUILD_BUG(NODES_X < 3 || NODES_Y < 3 || NODES_Z < 3); + BUILD_BUG(CORES_X < 1 || CORES_Y < 1); + BUILD_BUG(CORES_X > 4 || CORES_Y > 4); + + /* save mesh coordinates */ + e_coords_from_coreid(e_get_coreid(), &col, &row); + core = row * CORES_X + col; + + /* initialize barrier */ + e_barrier_init(barriers, tgt_bars); + + /* initialize lattice */ + init(*block); + + /* main loop */ + for(int iter = 0; iter < 3000; iter++) { +#if 1 + /* collide all nodes */ + for(int z = 0; z < NODES_Z; z++) + for(int y = 0; y < NODES_Y; y++) + for(int x = 0; x < NODES_X; x++) + collide(*block, x, y, z, omega); + + /* synchronize */ + e_barrier(barriers, tgt_bars); + + /* stream all nodes */ + for(int z = 0; z < NODES_Z; z++) + for(int y = 0; y < NODES_Y; y++) + for(int x = 0; x < NODES_X; x++) + stream(*block, x, y, z); + +#else + /* collide boundaries: top, bottom */ + for(int x = 0; x < BLOCKS_X; x++) { + d2q9_collide(*block, x, 0, omega); + d2q9_collide(*block, x, BLOCKS_Y-1, omega); + } + + /* collide boundaries: left, right */ + for(int y = 1; y < BLOCKS_Y-1; y++) { + d2q9_collide(*block, 0, y, omega); + d2q9_collide(*block, BLOCKS_X-1, y, omega); + } + + /* synchronize */ + e_barrier(barriers, tgt_bars); + + /* collide and stream the bulk */ + d2q9_collide_stream_bulk(*block, omega); + + /* stream the boundaries: top, bottom */ + for(int x = 0; x < BLOCKS_X; x++) { + d2q9_stream(*block, x, 0 ); + d2q9_stream(*block, x, BLOCKS_Y-1); + } + + /* stream the boundaries: left, right */ + for(int y = 1; y < BLOCKS_Y-1; y++) { + d2q9_stream(*block, 0, y); + d2q9_stream(*block, BLOCKS_X-1, y); + } +#endif + + /* copy data to shm if necessary */ + if(!(iter % 3)) { + /* copy lattice to shm */ + memcpy(&shm.lattice[row][col], block, sizeof(block_t)); + + /* copy times to shm */ + memcpy(&shm.times[row][col], times, sizeof(times_t)); + + /* synchronize */ + e_barrier(barriers, tgt_bars); + + /* core 0: write counter and flag host; wait */ + if(core == 0) { + shm.iteration = iter; + shm.pollflag = POLL_READY; + while(shm.pollflag == POLL_READY); + } + } + + /* synchronize */ + e_barrier(barriers, tgt_bars); + } + + /* core 0: flag host, all iterations done */ + if(core == 0) { + shm.pollflag = POLL_DONE; + } + + /* stop core */ + while(1) { + __asm__ volatile("idle"); + } +} + diff --git a/d3q19/hsrc/data.c b/d3q19/hsrc/data.c new file mode 100644 index 0000000..bf20c45 --- /dev/null +++ b/d3q19/hsrc/data.c @@ -0,0 +1,237 @@ +/* Helper Functions to handle data (3D) */ + +#include +#include +#include +#include + +#include "../shared.h" + +/* fix file owner if run with sudo */ +void fixsudo(const char *filename) +{ + if(getenv("SUDO_UID") && getenv("SUDO_GID")) { + int uid = atoi(getenv("SUDO_UID")); + int gid = atoi(getenv("SUDO_GID")); + if(chown(filename, uid, gid)) { + perror("fixsudo/chown"); + return; + } + } +} + +#if 0 +/* write a (semi-) human-readable dump of the lattice */ +void write_populations(d2q9_block_t lattice[CORES_Y][CORES_X], int iter) +{ + FILE *file = fopen("populations.dat", "a"); + if(!file) { + perror("write_populations/fopen"); + return; + } + + for(int cy = 0; cy < CORES_Y; cy++) { + for(int y = 0; y < BLOCKS_Y; y++) { + for(int cx = 0; cx < CORES_X; cx++) { + for(int x = 0; x < BLOCKS_X; x++) { + fprintf(file, "%3d: [%3d,%3d]: ", + iter, + cx * BLOCKS_X + x, + cy * BLOCKS_Y + y + ); + for(int q = 0; q < 9; q++) { + fprintf(file, "%.5f ", lattice[cy][cx][y][x][q]); + } + fprintf(file, "\n"); + } + } + } + } + fprintf(file, "\n"); + + /* close */ + fclose(file); + + return; +} +#endif + +/* write 8-bit grayscale PPM of particle density in a slice */ +void write_density(block_t f[CORES_Y][CORES_X], int iter) +{ + char name[32]; + FLOAT min = 1.0, max = 0.0; + FLOAT rhos[LATTICE_Z][LATTICE_Y][LATTICE_X]; + + /* write file header */ + snprintf(name, 32, "./tmp/i%06d.ppm", iter); + FILE *file = fopen(name, "wb"); + if(!file) { + perror("write_density/fopen"); + return; + } + fprintf(file, "P5\n%d %d\n%d\n", LATTICE_X, LATTICE_Z, 255); + + /* calculate densities from grid */ + for(int cy = 0; cy < CORES_Y; cy++) + for(int z = 0; z < NODES_Z; z++) + for(int y = 0; y < NODES_Y; y++) + for(int cx = 0; cx < CORES_X; cx++) + for(int x = 0; x < NODES_X; x++) + { + const int idx[3] = { cx * NODES_X + x, y, cy * NODES_Z + z}; + + FLOAT rho = 0; + for(int q = 0; q < 19; q++) + rho += f[cy][cx][z][y][x][q]; + + if(rho < min) min = rho; + if(rho > max) max = rho; + rhos[idx[2]][idx[1]][idx[0]] = rho; + } + + /* write slice of grid */ + for(int z = 0; z < LATTICE_Z; z++) { + for(int x = 0; x < LATTICE_X; x++) { + FLOAT value = rhos[z][NODES_Y/2][x]; + + unsigned char gray = ( 255 * + (value - min) / (max - min) ); + fwrite(&gray, 1, 1, file); + } + } + + /* close the file and fix permissions */ + fclose(file); + fixsudo(name); + + return; +} + +/* write 8-bit grayscale PPM of absolute particle velocity in a slice */ +void write_velocity(block_t f[CORES_Y][CORES_X], int iter) +{ + char name[32]; + FLOAT min = 1.0, max = 0.0; + FLOAT vs[LATTICE_Z][LATTICE_Y][LATTICE_X]; + + /* write file header */ + snprintf(name, 32, "./tmp/i%06d.ppm", iter); + FILE *file = fopen(name, "wb"); + if(!file) { + perror("write_density/fopen"); + return; + } + fprintf(file, "P5\n%d %d\n%d\n", LATTICE_X, LATTICE_Z, 255); + + /* calculate velocities from grid */ + for(int cy = 0; cy < CORES_Y; cy++) + for(int z = 0; z < NODES_Z; z++) + for(int y = 0; y < NODES_Y; y++) + for(int cx = 0; cx < CORES_X; cx++) + for(int x = 0; x < NODES_X; x++) + { + const int idx[3] = { cx * NODES_X + x, y, cy * NODES_Z + z}; + + FLOAT rho = 0; + for(int q = 0; q < 19; q++) + rho += f[cy][cx][z][y][x][q]; + + FLOAT ux = ( + f[cy][cx][z][y][x][10] + f[cy][cx][z][y][x][13] + + f[cy][cx][z][y][x][14] + f[cy][cx][z][y][x][15] + + f[cy][cx][z][y][x][16] - f[cy][cx][z][y][x][1] - + f[cy][cx][z][y][x][4] - f[cy][cx][z][y][x][5] - + f[cy][cx][z][y][x][6] - f[cy][cx][z][y][x][7] + ) / rho; + + FLOAT uy = ( + f[cy][cx][z][y][x][5] + f[cy][cx][z][y][x][11] + + f[cy][cx][z][y][x][13] + f[cy][cx][z][y][x][17] + + f[cy][cx][z][y][x][18] - f[cy][cx][z][y][x][2] - + f[cy][cx][z][y][x][4] - f[cy][cx][z][y][x][8] - + f[cy][cx][z][y][x][9] - f[cy][cx][z][y][x][14] + ) / rho; + + FLOAT uz = ( + f[cy][cx][z][y][x][7] + f[cy][cx][z][y][x][9] + + f[cy][cx][z][y][x][12] + f[cy][cx][z][y][x][15] + + f[cy][cx][z][y][x][17] - f[cy][cx][z][y][x][3] - + f[cy][cx][z][y][x][6] - f[cy][cx][z][y][x][8] - + f[cy][cx][z][y][x][16] - f[cy][cx][z][y][x][18] + ) / rho; + + FLOAT u = sqrtf(ux*ux + uy*uy + uz*uz); + if(u < min) min = u; + if(u > max) max = u; + vs[idx[2]][idx[1]][idx[0]] = u; + } + + /* write slice of grid */ + for(int z = 0; z < LATTICE_Z; z++) { + for(int x = 0; x < LATTICE_X; x++) { + FLOAT value = vs[z][NODES_Y/2][x]; + + unsigned char gray = ( 255 * + (value - min) / (max - min) ); + fwrite(&gray, 1, 1, file); + } + } + + /* close the file and fix permissions */ + fclose(file); + fixsudo(name); + + return; +} + +/* convert image files to animated gif ./tmp/anim.gif */ +void convert_to_gif(void) +{ + int result; + + /* call imagemagick */ + result = system("convert ./tmp/i*.ppm ./tmp/anim.gif"); (void)result; + fixsudo("./tmp/anim.gif"); + + return; +} + +/* convert image files to mp4 ./tmp/anim.mp4 */ +void convert_to_mp4(void) +{ + int result; + + /* call ffmpeg + FIXME: requires ffmpeg binary with '-pattern_type glob' support, + so this will not work on the Parallella without updates! */ + result = system("ffmpeg -v quiet -pattern_type glob -i " + "'./tmp/i*.ppm' ./tmp/anim.mp4"); (void)result; + fixsudo("./tmp/anim.mp4"); + + return; +} + +/* write timer values */ +void write_timers(times_t t[CORES_Y][CORES_X], uint32_t iter) +{ + FILE *file = fopen("timers.dat", "ab"); + if(!file) { + perror("write_timers/fopen"); + return; + } + + fprintf(file, "Iteration: i=%d\n", iter); + for(int y = 0; y < CORES_Y; y++) { + for(int x = 0; x < CORES_X; x++) { + fprintf(file, "[%d,%d]: ", x, y); + for(int i = 0; i < TIMERS; i++) { + fprintf(file, "%8d ", t[y][x][i]); + } + fprintf(file, "\n"); + } + } + + fclose(file); +} + diff --git a/d3q19/hsrc/main.c b/d3q19/hsrc/main.c new file mode 100644 index 0000000..e0363b3 --- /dev/null +++ b/d3q19/hsrc/main.c @@ -0,0 +1,117 @@ +/* Host Application */ + +#include +#include +#include +#include +#include +#include + +#include +#include "../shared.h" + +#define FAIL(...) { fprintf(stderr, __VA_ARGS__); exit(1); } +#define SHM_OFFSET 0x01000000 + +/* helper functions */ +void fixsudo(const char *filename); +void write_populations(block_t f[CORES_Y][CORES_X], int iter); +void write_density (block_t f[CORES_Y][CORES_X], int iter); +void write_velocity (block_t f[CORES_Y][CORES_X], int iter); +void convert_to_gif(void); +void convert_to_mp4(void); +void write_timers(times_t t[CORES_Y][CORES_X], int iter); + +/* globals */ +static shm_t shm = { 0 }; /* local shm copy */ +static uint32_t pollflag; + +int main() +{ + char *filename = "bin/main.srec"; + + /* remove old results */ + int dummy = system("rm -f ./tmp/i*.ppm ./tmp/anim.gif ./tmp/anim.mp4 populations.dat timers.dat"); + (void)dummy; + + e_epiphany_t dev; + e_mem_t mem; + + e_set_host_verbosity(H_D0); + e_set_loader_verbosity(L_D0); + + /* initialize workgroup, allocate and clear shared memory */ + if(e_init(NULL) != E_OK) + FAIL("Can't init!\n"); + e_reset_system(); + if(e_open(&dev, 0, 0, CORES_X, CORES_Y) != E_OK) + FAIL("Can't open!\n"); + if(e_alloc(&mem, SHM_OFFSET, sizeof(shm_t)) != E_OK) + FAIL("Can't alloc!\n"); + if(e_write(&mem, 0, 0, (off_t)0, &shm, sizeof(shm_t)) == E_ERR) + FAIL("Can't clear shm!\n"); + + /* load programs */ + printf("Starting cores:\n"); + for(int y = 0; y < CORES_Y; y++) { + for(int x = 0; x < CORES_X; x++) { + printf("(%02d,%02d) ", x, y); + if(e_load(filename, &dev, x, y, E_TRUE) != E_OK) + FAIL("Can't load!\n"); + } + printf("\n"); + } + + /* ================================================================ */ + printf("Polling shared memory.\n"); + while(1) { + + while(1) { + /* read polling flag */ + if(e_read(&mem, 0, 0, (off_t)0, &pollflag, + sizeof(uint32_t)) == E_ERR) + FAIL("Can't read pollflag!\n"); + + /* wait for something */ + if(pollflag != POLL_BUSY) break; + } + + /* finish if done */ + if(pollflag == POLL_DONE) break; + + /* read full shared memory */ + if(e_read(&mem, 0, 0, (off_t)0, &shm, sizeof(shm_t)) == E_ERR) + FAIL("Can't read full shm!\n"); + + /* reset pollflag */ + pollflag = 0; + if(e_write(&mem, 0, 0, (off_t)0, &pollflag, + sizeof(uint32_t)) == E_ERR) { + FAIL("Can't reset pollflag!\n"); + } + + /* print iteration */ + printf("0x%08x\r", shm.iteration); fflush(stdout); + + /* write data */ + //write_populations(shm.lattice, shm.iteration); + write_density(shm.lattice, shm.iteration); + //write_velocity(shm.lattice, shm.iteration); + write_timers(shm.times, shm.iteration); + } + /* ================================================================ */ + + if(e_free(&mem) != E_OK) FAIL("Can't free!\n"); + if(e_close(&dev) != E_OK) FAIL("Can't close!\n"); + if(e_finalize() != E_OK) FAIL("Can't finalize!\n"); + + fixsudo("populations.dat"); + fixsudo("timers.dat"); + + printf("\nProgram finished successfully.\n"); + printf("Convert ...\n"); + convert_to_gif(); + + return(0); +} + diff --git a/d3q19/shared.h b/d3q19/shared.h new file mode 100644 index 0000000..b0c278e --- /dev/null +++ b/d3q19/shared.h @@ -0,0 +1,58 @@ +/* shared data types and external memory layout */ +#ifndef _SHARED_H_ +#define _SHARED_H_ + +#include + +/* ================================================================== */ + +/* number of cores */ +#define CORES_X 4 +#define CORES_Y 4 + +/* number of nodes per core */ +#define NODES_X 10 +#define NODES_Y 3 +#define NODES_Z 10 + +/* ================================================================== */ + +/* preprocessor magic */ +#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0); +#define USED __attribute__((used)) +#undef PACKED +#define PACKED __attribute__((packed)) +#undef ALIGN +#define ALIGN(X) __attribute__((aligned(X))) + +/* some calculations */ +#define CORES (CORES_X * CORES_Y) +#define NODES (NODES_X * NODES_Y * NODES_Z) +#define LATTICE_X (NODES_X * CORES_X) +#define LATTICE_Y (NODES_Y) +#define LATTICE_Z (NODES_Z * CORES_Y) + +/* number of timer values */ +#define TIMERS 12 + +/* pollflag values */ +#define POLL_BUSY 0x00 +#define POLL_READY 0x01 +#define POLL_DONE 0x02 + +/* data types */ +typedef float FLOAT; +typedef FLOAT node_t[19]; +typedef node_t block_t[NODES_Z][NODES_Y][NODES_X]; +typedef uint32_t times_t[TIMERS]; + +/* shared memory structure */ +typedef struct { + uint32_t pollflag; + uint32_t iteration; + times_t times[CORES_Y][CORES_X]; + block_t lattice[CORES_Y][CORES_X]; +} ALIGN(8) shm_t; + +#endif /* _SHARED_H_ */ + -- 2.30.2