/* D2Q9 lattice boltzmann functions */
+#include <e-lib.h>
#include "../shared.h"
+#include "lb.h"
/* velocities */
static const int d2q9_v[9][2] = { { 0, 0},
1./36., 1./9., 1./36., 1./9.,
};
/* Initialise one block of the lattice (this patch renames init_block to
 * d2q9_init).
 * Every node gets its nine populations set to 0.1 * d2q9_w[q]; afterwards
 * node [0][0] is overwritten with 0.2 * d2q9_w[q].
 * In the added (+) version the loops run y-outer / x-inner with
 * block[y][x] indexing, and the 0.2 node is written only when the global
 * `core` index is 0. */
-void init_block(d2q9_block_t block)
+void d2q9_init(d2q9_block_t block)
{
/* all with rho = 0.1 */
- for(int x = 0; x < BLOCK_X; x++)
- for(int y = 0; y < BLOCK_Y; y++)
+ for(int y = 0; y < BLOCK_Y; y++)
+ for(int x = 0; x < BLOCK_X; x++)
for(int q = 0; q < 9; q++)
- block[x][y][q] = 0.1 * d2q9_w[q];
+ block[y][x][q] = 0.1 * d2q9_w[q];
- /* except here with 0.2 */
- for(int q = 0; q < 9; q++)
- block[0][0][q] = 0.2 * d2q9_w[q];
+ if(core == 0) {
+ /* except here with 0.2 */
+ for(int q = 0; q < 9; q++)
+ block[0][0][q] = 0.2 * d2q9_w[q];
+ }
return;
}
/* Collide a single node and swap its opposite populations (this patch
 * renames collide_and_swap to d2q9_collide and switches f[x][y] to
 * f[y][x]).
 *   f     - block containing the node
 *   x, y  - node coordinates inside the block
 *   omega - relaxation factor: f[q] = (1 - omega) * f[q] + omega * eq
 * After the relaxation, populations q and q+4 are exchanged for
 * q = 1..4. */
-void collide_and_swap(d2q9_block_t f, int x, int y, FLOAT omega)
+void d2q9_collide(d2q9_block_t f, int x, int y, FLOAT omega)
{
/* macroscopic */
/* rho is the sum of all nine populations; ux and uy are signed sums of
 * populations divided by rho. rho is not checked for zero before the
 * divisions. */
- FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] +
- f[x][y][3] + f[x][y][4] + f[x][y][5] +
- f[x][y][6] + f[x][y][7] + f[x][y][8];
- FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] -
- f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho;
- FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] -
- f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho;
+ FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+ f[y][x][3] + f[y][x][4] + f[y][x][5] +
+ f[y][x][6] + f[y][x][7] + f[y][x][8];
+ FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+ f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+ FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+ f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
FLOAT sqr = 1.5 * (ux*ux + uy*uy);
/* update node */
/* NOTE(review): the loop header that declares q and opens the brace
 * closed below is not visible in this excerpt of the patch. */
FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
FLOAT eq = rho * d2q9_w[q] *
(1. + 3. * cu + 4.5 * cu*cu - sqr);
- f[x][y][q] *= (1.0 - omega);
- f[x][y][q] += omega * eq;
+ f[y][x][q] *= (1.0 - omega);
+ f[y][x][q] += omega * eq;
}
/* swap */
for(int q = 1; q <= 4; q++) {
- FLOAT tmp = f[x][y][q];
- f[x][y][q] = f[x][y][q+4];
- f[x][y][q+4] = tmp;
+ FLOAT tmp = f[y][x][q];
+ f[y][x][q] = f[y][x][q+4];
+ f[y][x][q+4] = tmp;
}
}
/* Stream one node with its neighbours in directions q = 1..4 (this patch
 * renames stream_node to d2q9_stream).
 * For every direction the population q+4 of node (x, y) is exchanged with
 * population q of the neighbour at (x, y) + d2q9_v[q].
 * Removed (-) version: neighbour coordinates wrap around inside the one
 * local block.
 * Added (+) version: a neighbour that falls outside the local block is
 * taken from the adjacent core, the core grid itself wraps around at
 * CORES_X / CORES_Y, and the neighbouring block is reached through
 * e_get_global_address. */
-void stream_node(d2q9_block_t f, int x, int y)
+void d2q9_stream(d2q9_block_t f, int x, int y)
{
for(int q = 1; q <= 4; q++) {
- int next_x = x + d2q9_v[q][0];
- int next_y = y + d2q9_v[q][1];
-
- /* wrap around */
- if(next_x < 0) next_x += BLOCK_X;
- else if(next_x >= BLOCK_X) next_x -= BLOCK_X;
- if(next_y < 0) next_y += BLOCK_Y;
- else if(next_y >= BLOCK_Y) next_y -= BLOCK_Y;
-
- FLOAT tmp = f[x][y][q+4];
- f[x][y][q+4] = f[next_x][next_y][q];
- f[next_x][next_y][q] = tmp;
/* row and col are the unsigned globals declared in lb.h; they are copied
 * into signed ints here so the decrements below can go negative before
 * the wrap-around is applied. */
+ int next_row = row;
+ int next_col = col;
+ int next_x = x + d2q9_v[q][0];
+ int next_y = y + d2q9_v[q][1];
+
+ /* inner borders (extend) */
+ if(next_x < 0) { next_col--; next_x += BLOCK_X; }
+ else if(next_x >= BLOCK_X) { next_col++; next_x -= BLOCK_X; }
+ if(next_y < 0) { next_row--; next_y += BLOCK_Y; }
+ else if(next_y >= BLOCK_Y) { next_row++; next_y -= BLOCK_Y; }
+
+ /* outer borders (wrap around) */
+ if(next_col < 0) { next_col += CORES_X; }
+ else if(next_col >= CORES_X) { next_col -= CORES_X; }
+ if(next_row < 0) { next_row += CORES_Y; }
+ else if(next_row >= CORES_Y) { next_row -= CORES_Y; }
+
+ /* f: local block, g: local or remote block */
/* f is an array parameter and therefore a pointer to the first row of
 * the block; g views the same address as a pointer to a whole block so
 * that it can be re-targeted at another core.
 * NOTE(review): e_get_global_address is called with (next_col, next_row).
 * Confirm this matches the e-lib parameter order and the way row and col
 * are filled in by the core-side init. */
+ d2q9_block_t *g = (void*)f;
+ if(next_row != row || next_col != col) {
+ g = e_get_global_address(next_col, next_row, (void*)f);
+ }
+
+ /* stream/swap f and g */
/* When g points at another core, the last statement writes into that
 * core's block. */
+ FLOAT tmp = f[y][x][q+4];
+ f[y][x][q+4] = (*g)[next_y][next_x][q];
+ (*g)[next_y][next_x][q] = tmp;
}
}
/* Fused collide + stream pass over the interior nodes of a block (this
 * patch renames collide_and_stream_bulk and switches f[x][y] to f[y][x]).
 * Only nodes with 1 <= x <= BLOCK_X-2 and 1 <= y <= BLOCK_Y-2 are
 * visited; omega is the relaxation factor used as in d2q9_collide.
 * NOTE(review): the loop order stays x-outer / y-inner although the
 * indexing is now f[y][x]; confirm this is intended. */
-void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega)
+void d2q9_collide_stream_bulk(d2q9_block_t f, FLOAT omega)
{
/* don't touch the border nodes */
for(int x = 1; x < BLOCK_X-1; x++) {
for(int y = 1; y < BLOCK_Y-1; y++) {
/* macroscopic */
/* Same density and velocity sums as in d2q9_collide; rho is not checked
 * for zero before the divisions. */
- FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] +
- f[x][y][3] + f[x][y][4] + f[x][y][5] +
- f[x][y][6] + f[x][y][7] + f[x][y][8];
- FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] -
- f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho;
- FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] -
- f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho;
+ FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+ f[y][x][3] + f[y][x][4] + f[y][x][5] +
+ f[y][x][6] + f[y][x][7] + f[y][x][8];
+ FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+ f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+ FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+ f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
FLOAT sqr = 1.5 * (ux*ux + uy*uy);
/* update node */
/* NOTE(review): the loop header that declares q for the update below is
 * not visible in this excerpt of the patch. */
FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
FLOAT eq = rho * d2q9_w[q] *
(1. + 3. * cu + 4.5 * cu*cu - sqr);
- f[x][y][q] *= (1.0 - omega);
- f[x][y][q] += omega * eq;
+ f[y][x][q] *= (1.0 - omega);
+ f[y][x][q] += omega * eq;
}
/* stream */
/* NOTE(review): the loop header for this second use of q is also elided.
 * The four statements rotate three values: the local q+4 moves to local
 * q, the neighbour q moves to local q+4, and the old local q moves to
 * the neighbour q. The neighbour may be a border node of this block. */
int next_x = x + d2q9_v[q][0];
int next_y = y + d2q9_v[q][1];
- FLOAT tmp = f[x][y][q];
- f[x][y][q] = f[x][y][q+4];
- f[x][y][q+4] = f[next_x][next_y][q];
- f[next_x][next_y][q] = tmp;
+ FLOAT tmp = f[y][x][q];
+ f[y][x][q] = f[y][x][q+4];
+ f[y][x][q+4] = f[next_y][next_x][q];
+ f[next_y][next_x][q] = tmp;
}
}
}
-/* D2Q9 lattice boltzmann functions */
+/* lattice boltzmann functions */
#include "../shared.h"
-void init_block (d2q9_block_t);
-
-void collide_and_swap (d2q9_block_t, int x, int y, FLOAT);
-void stream_node (d2q9_block_t, int x, int y);
-
-void collide_and_stream_bulk(d2q9_block_t, FLOAT);
-
-
+/* core index */
+extern unsigned int row, col, core;
+/* D2Q9 functions */
+void d2q9_init (d2q9_block_t);
+void d2q9_collide (d2q9_block_t, int x, int y, FLOAT);
+void d2q9_stream (d2q9_block_t, int x, int y);
+void d2q9_collide_stream_bulk(d2q9_block_t, FLOAT);
/* shared memory overlay */
volatile shm_t shm SECTION(".shared_dram");
-/* statically allocate dummy memory
- to prevent linker from putting stuff there */
-uint8_t dummy_bank1[8192] SECTION(".data_bank1");
-uint8_t dummy_bank2[8192] SECTION(".data_bank2");
-uint8_t dummy_bank3[8192] SECTION(".data_bank3");
+/* statically allocate dummy memory and local block overlay
+ to prevent linker from putting stuff in banks 1..3 */
+static uint8_t dummy_bank1[8192] UNUSED SECTION(".data_bank1");
+static uint8_t dummy_bank2[8192] UNUSED SECTION(".data_bank2");
+static uint8_t dummy_bank3[8192] UNUSED SECTION(".data_bank3");
+static d2q9_block_t *block = (void*)0x2000;
-/* local block, aliased over dummy_bankX */
-d2q9_block_t *block = (void*)0x2000;
+/* barrier structures */
+volatile e_barrier_t barriers[NUM_CORES];
+ e_barrier_t *tgt_bars[NUM_CORES];
/* Busy-wait helper built from empty loops with volatile counters.
 * Removed (-) version: fixed 1000000 x 10 iterations.
 * Added (+) version: x outer iterations of 1000000 inner iterations each.
 * The same hunk also adds the definitions of the global row, col and
 * core index variables. */
-void delay(void)
+/* global index variables */
+unsigned int row, col, core;
+
+void delay(int x)
{
- for(volatile int i = 0; i < 1000000; i++)
- for(volatile int j = 0; j < 10; j++)
+ for(volatile int j = 0; j < x; j++)
+ for(volatile int i = 0; i < 1000000; i++)
;
}
/* Per-core setup, split out of main by this patch: compile-time size
 * checks, computation of the row / col / core indices and barrier
 * initialisation. */
-int main()
+void init(void)
{
+ /* compile-time checks */
/* The block must not exceed 24*1024 bytes, and the core grid must not be
 * larger than 4 x 4. */
BUILD_BUG(BLOCK_X * BLOCK_Y * sizeof(d2q9_node_t) > 24*1024);
+ BUILD_BUG(CORES_X > 4 || CORES_Y > 4);
+
+ /* core index */
/* NOTE(review): &col is passed before &row. Confirm this against the
 * e-lib parameter order of e_coords_from_coreid; d2q9_stream passes
 * (next_col, next_row) to e_get_global_address in the same order. */
+ e_coords_from_coreid(e_get_coreid(), &col, &row);
+ core = row * CORES_X + col;
+ /* barrier initialization */
+ e_barrier_init(barriers, tgt_bars);
+}
+
/* Core-side entry point.
 * Calls init() and d2q9_init(), then runs 10000 iterations (the removed
 * version looped forever). The disabled `#if 0` branch collides every
 * node, synchronises and streams every node. The active branch collides
 * the border nodes, synchronises, runs the fused bulk pass and finally
 * streams the border nodes.
 * After every iteration the local block is copied to
 * shm.lattice[row][col] and all cores meet at a barrier; core 0 bumps its
 * state counter on every 100th iteration. After the loop each core stores
 * -1 in its state slot and spins forever. */
+int main()
+{
const FLOAT omega = 1.0;
- init_block(*block);
+ init();
+ d2q9_init(*block);
- while(1) {
+ for(int i = 0; i < 10000; i++) {
#if 0
/* collide all nodes */
- for(int x = 0; x < BLOCK_X; x++)
- for(int y = 0; y < BLOCK_Y; y++)
- collide_and_swap(*block, x, y, omega);
+ for(int y = 0; y < BLOCK_Y; y++)
+ for(int x = 0; x < BLOCK_X; x++)
+ d2q9_collide(*block, x, y, omega);
- /* XXX synchronize */
+ /* synchronize */
+ e_barrier(barriers, tgt_bars);
/* stream all nodes */
- for(int x = 0; x < BLOCK_X; x++)
- for(int y = 0; y < BLOCK_Y; y++)
- stream_node(*block, x, y);
-
- /* XXX synchronize */
+ for(int y = 0; y < BLOCK_Y; y++)
+ for(int x = 0; x < BLOCK_X; x++)
+ d2q9_stream(*block, x, y);
#else
/* collide boundaries: top, bottom */
for(int x = 0; x < BLOCK_X; x++) {
- collide_and_swap(*block, x, 0, omega);
- collide_and_swap(*block, x, BLOCK_Y-1, omega);
+ d2q9_collide(*block, x, 0, omega);
+ d2q9_collide(*block, x, BLOCK_Y-1, omega);
}
/* collide boundaries: left, right */
for(int y = 1; y < BLOCK_Y-1; y++) {
- collide_and_swap(*block, 0, y, omega);
- collide_and_swap(*block, BLOCK_X-1, y, omega);
+ d2q9_collide(*block, 0, y, omega);
+ d2q9_collide(*block, BLOCK_X-1, y, omega);
}
- /* XXX synchronize */
+ /* synchronize */
+ e_barrier(barriers, tgt_bars);
/* collide and stream the bulk */
- collide_and_stream_bulk(*block, omega);
+ d2q9_collide_stream_bulk(*block, omega);
/* stream the boundaries: top, bottom */
for(int x = 0; x < BLOCK_X; x++) {
- stream_node(*block, x, 0 );
- stream_node(*block, x, BLOCK_Y-1);
+ d2q9_stream(*block, x, 0 );
+ d2q9_stream(*block, x, BLOCK_Y-1);
}
/* stream the boundaries: left, right */
for(int y = 1; y < BLOCK_Y-1; y++) {
- stream_node(*block, 0, y);
- stream_node(*block, BLOCK_X-1, y);
+ d2q9_stream(*block, 0, y);
+ d2q9_stream(*block, BLOCK_X-1, y);
}
-
- /* XXX synchronize */
#endif
-
- /* copy grid to shm */
- memcpy(&shm.lattice[0], block, sizeof(d2q9_block_t));
/* NOTE(review): d2q9_stream writes into neighbouring cores' blocks, and
 * the only barrier after streaming comes after this memcpy. A neighbour
 * may therefore still be writing into this block while it is copied.
 * Confirm whether a barrier is needed before the copy. */
+ /* copy grid to shm and synchronize */
+ memcpy(&shm.lattice[row][col], block, sizeof(d2q9_block_t));
+ e_barrier(barriers, tgt_bars);
/* flag host */
- shm.states[0]++;
+ if(core == 0 && !(i%100)) {
+ shm.states[row][col]++;
+ delay(1);
+ }
}
/* states_t holds uint32_t values, so -1 is stored as all bits set; the
 * host polling loop compares states[0][0] against -1 to stop. */
+ shm.states[row][col] = -1;
while(1);
}
/* Host Application */
#include <stdio.h>
-#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#define SHM_OFFSET 0x01000000
static states_t laststates, states; /* old state value */
-static shm_t shm = {{ 0 }}; /* local shm copy */
+static shm_t shm = {{{ 0 }}}; /* local shm copy */
-void write_populations(FILE *file, int core, int iter)
+void write_populations(FILE *file, int core_x, int core_y, int iter)
{
- for(int x = 0; x < BLOCK_X; x++) {
- for(int y = 0; y < BLOCK_Y; y++) {
+ for(int y = 0; y < BLOCK_Y; y++) {
+ for(int x = 0; x < BLOCK_X; x++) {
fprintf(file, "%03d: [%02d,%02d]: ", iter, x, y);
for(int q = 0; q < 9; q++) {
fprintf(file, "%.5f\t",
- shm.lattice[core][x][y][q]);
+ shm.lattice[core_y][core_x][y][x][q]);
}
fprintf(file, "\n");
}
{
/* Body of the host-side image writer (presumably write_image(iter); the
 * signature and the closing brace are not visible in this excerpt).
 * It sums the nine populations of every node into a density, tracks the
 * minimum and maximum, scales each density to one gray byte and writes
 * the bytes to ./tmp/i<iter>.ppm. The file is then chowned to the ids
 * found in SUDO_UID / SUDO_GID. The added (+) version covers the whole
 * CORES_X x CORES_Y grid instead of a single block. */
FILE *file; char name[32];
- FLOAT rhos[4][4][BLOCK_X][BLOCK_Y];
- FLOAT min = 1, max = 0;
- uint8_t gray;
-
snprintf(name, 32, "./tmp/i%06d.ppm", iter);
file = fopen(name, "wb");
if(!file) exit(-1);
/* NOTE(review): the header magic is P5, which Netpbm defines as a
 * graymap, while the file name ends in .ppm. Confirm this is intended. */
- fprintf(file, "P5\n%d %d\n%d\n", BLOCK_X, BLOCK_Y, 255);
+ fprintf(file, "P5\n%d %d\n%d\n",
+ CORES_X*BLOCK_X, CORES_Y*BLOCK_Y, 255);
/* calculate all densities and remember min/max */
- int cx = 0, cy = 0;
-// for(int cy = 0; cy < 1; cy++) {
-// for(int cx = 0; cx < 1; cx++) {
- for(int x = 0; x < BLOCK_X; x++) {
- for(int y = 0; y < BLOCK_Y; y++) {
- rhos[cy][cx][x][y] = 0;
- for(int q = 0; q < 9; q++) {
- rhos[cy][cx][x][y] +=
- shm.lattice[cy*4+cx][x][y][q];
- }
-
- if(rhos[cy][cx][x][y] < min)
- min = rhos[cy][cx][x][y];
- if(rhos[cy][cx][x][y] > max)
- max = rhos[cy][cx][x][y];
/* rhos is indexed [cy][y][cx][x], matching the loop nesting below, so
 * each image row runs across all cores of one core row. */
+ FLOAT min = 1.0, max = 0;
+ FLOAT rhos[CORES_Y][BLOCK_Y][CORES_X][BLOCK_X];
+ for(int cy = 0; cy < CORES_Y; cy++) {
+ for(int y = 0; y < BLOCK_Y; y++) {
+ for(int cx = 0; cx < CORES_X; cx++) {
+ for(int x = 0; x < BLOCK_X; x++) {
+ FLOAT rho = 0;
+ for(int q = 0; q < 9; q++)
+ rho += shm.lattice[cy][cx][y][x][q];
+ rhos[cy][y][cx][x] = rho;
+
+ if(rho < min) min = rho;
+ if(rho > max) max = rho;
}
}
-// }
-// }
+ }
+ }
/* now scale values and write to image file */
/* NOTE(review): if all densities are equal, max-min is 0 and the scaling
 * below divides by zero. */
-// for(int cy = 0; cy < 4; cy++) {
-// for(int cx = 0; cx < 4; cx++) {
- for(int x = 0; x < BLOCK_X; x++) {
- for(int y = 0; y < BLOCK_Y; y++) {
- gray = (int)(255.*(rhos[cy][cx][x][y]-min) / (max-min));
+ for(int cy = 0; cy < CORES_Y; cy++) {
+ for(int y = 0; y < BLOCK_Y; y++) {
+ for(int cx = 0; cx < CORES_X; cx++) {
+ for(int x = 0; x < BLOCK_X; x++) {
+ unsigned char gray;
+ gray = (255. * (rhos[cy][y][cx][x]-min)
+ / (max-min));
fwrite(&gray, 1, 1, file);
}
}
-// }
-// }
+ }
+ }
fclose(file);
/* NOTE(review): getenv returns NULL when a variable is unset, and
 * passing NULL to atoi is undefined. Confirm the program is always
 * started through sudo, or check the results first. */
if(chown(name, atoi(getenv("SUDO_UID")), atoi(getenv("SUDO_GID")))) {
- perror("chown");
+ FAIL("Can't chown image!\n");
}
return;
FILE *datfile; char *datname = "populations.dat";
int dummy, old0 = 0;
+ /* remove old results */
+ dummy = system("rm -f ./tmp/i*.ppm ./tmp/anim.gif populations.dat");
+ (void)dummy;
+
e_epiphany_t dev;
e_mem_t mem;
e_set_host_verbosity(H_D0);
e_set_loader_verbosity(L_D0);
- dummy = system("rm ./tmp/i*.ppm ./tmp/anim.gif"); (void)dummy;
-
/* overwrite results file */
datfile = fopen(datname, "w");
if(!datfile)
if(e_init(NULL) != E_OK)
FAIL("Can't init!\n");
e_reset_system();
- if(e_open(&dev, 0, 0, 4, 4) != E_OK)
+ if(e_open(&dev, 0, 0, CORES_X, CORES_Y) != E_OK)
FAIL("Can't open!\n");
if(e_alloc(&mem, SHM_OFFSET, sizeof(shm_t)) != E_OK)
FAIL("Can't alloc!\n");
FAIL("Can't clear shm!\n");
/* load programs */
- if(e_load(filename, &dev, 0, 0, E_TRUE) != E_OK)
- FAIL("Can't load!\n");
+ printf("Starting cores:\n");
+ for(int y = 0; y < CORES_Y; y++) {
+ for(int x = 0; x < CORES_X; x++) {
+ printf("(%02d,%02d) ", x, y);
+ if(e_load(filename, &dev, x, y, E_TRUE) != E_OK)
+ FAIL("Can't load!\n");
+ }
+ printf("\n");
+ }
/* ================================================================ */
+ printf("Polling shared memory.\n");
while(1) {
- printf("Polling shared memory.\n");
while(1) {
/* read states */
memcpy(&laststates, &shm, sizeof(states_t));
/* print states */
- printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
- states[0], states[1], states[2], states[3]);
- printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
- states[4], states[5], states[6], states[7]);
- printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
- states[8], states[9], states[10], states[11]);
- printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
- states[12], states[13], states[14], states[15]);
-
- /* write populations */
- if(states[0] != old0) {
- write_populations(datfile, 0, states[0]);
- write_image(states[0]);
- old0 = states[0];
+#if 0
+ for(int y = 0; y < CORES_Y; y++) {
+ printf("\t");
+ for(int x = 0; x < CORES_X; x++) {
+ printf("0x%08x ", states[y][x]);
+ }
+ printf("\n");
+ }
+#else
+ printf("0x%08x\r", states[0][0]);
+ fflush(stdout);
+#endif
+
+ /* write data */
+ if(states[0][0] != old0) {
+ //write_populations(datfile, 0, states[0]);
+ write_image(states[0][0]);
+ old0 = states[0][0];
}
- if(states[0] >= 21) break;
+ if(states[0][0] == -1) break;
}
/* ================================================================ */
#include <stdint.h>
-/* PACKED is defined for e-gcc, but not for gcc */
+/* preprocessor magic */
/* BUILD_BUG(c) fails to compile when c is true: the array size becomes
 * 1 - 2 = -1. When c is false the size is 1 and the sizeof is discarded.
 * NOTE(review): the macro ends in `while(0);` with a trailing semicolon,
 * so `BUILD_BUG(x);` expands to two statements. That is harmless at the
 * visible call sites but would break an unbraced if/else.
 * UNUSED marks an object as intentionally unreferenced; PACKED is only
 * defined here when the toolchain has not already defined it. */
+#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
+#define UNUSED __attribute__((unused))
#ifndef PACKED
#define PACKED __attribute__((packed))
#endif /* PACKED */
-/* produce compile-time errors if condition is true */
-#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
-
/* number of cores */
-#define NUM_CORES 16
+#define CORES_X 4
+#define CORES_Y 4
+#define NUM_CORES (CORES_X * CORES_Y)
/* size of per-core subgrid */
-#define BLOCK_X 15
-#define BLOCK_Y 15
+#define BLOCK_X 26
+#define BLOCK_Y 26
/* floating point type */
typedef float FLOAT;
/* state type */
/* One 32-bit counter per core; the added (+) version indexes it as
 * states[core_y][core_x] instead of a flat core number. */
-typedef uint32_t states_t[NUM_CORES];
+typedef uint32_t states_t[CORES_Y][CORES_X];
/* node and block type (D2Q9) */
/* A node holds nine populations. The added (+) version stores a block
 * row-major as block[y][x][q]; the removed version used block[x][y][q]. */
typedef FLOAT d2q9_node_t[9];
-typedef d2q9_node_t d2q9_block_t[BLOCK_X][BLOCK_Y];
+typedef d2q9_node_t d2q9_block_t[BLOCK_Y][BLOCK_X];
/* shared memory structure */
/* Packed layout shared by host and cores: the state counters first, then
 * one block per core, indexed lattice[core_y][core_x][y][x][q] in the
 * added (+) version. */
typedef struct {
states_t states;
- d2q9_block_t lattice[NUM_CORES];
+ d2q9_block_t lattice[CORES_Y][CORES_X];
} PACKED shm_t;
#endif /* _SHARED_H_ */