From: Sebastian Date: Mon, 30 Jun 2014 22:08:10 +0000 (+0000) Subject: inner borders X-Git-Url: http://sraa.de/git/?a=commitdiff_plain;h=033d3e6771ae79e5f817e0d3c0d456e7e35b4656;p=lattice-boltzmann-epiphany.git inner borders - implement inner borders, still wrap-around the outer borders - shm_t uses CORES_X, CORES_Y instead of linearized numbers - change index order to be [y][x] everywhere - maximum size now 104x104 (using 26x26 blocks in a 4x4 grid) - compile-time bombs for anything larger - finally supports non-square block and grid sizes --- diff --git a/lb/esrc/d2q9.c b/lb/esrc/d2q9.c index 22f5f5f..c4ada9d 100644 --- a/lb/esrc/d2q9.c +++ b/lb/esrc/d2q9.c @@ -1,6 +1,8 @@ /* D2Q9 lattice boltzmann functions */ +#include #include "../shared.h" +#include "lb.h" /* velocities */ static const int d2q9_v[9][2] = { { 0, 0}, @@ -14,31 +16,33 @@ static const FLOAT d2q9_w[9] = { 4./9., 1./36., 1./9., 1./36., 1./9., }; -void init_block(d2q9_block_t block) +void d2q9_init(d2q9_block_t block) { /* all with rho = 0.1 */ - for(int x = 0; x < BLOCK_X; x++) - for(int y = 0; y < BLOCK_Y; y++) + for(int y = 0; y < BLOCK_Y; y++) + for(int x = 0; x < BLOCK_X; x++) for(int q = 0; q < 9; q++) - block[x][y][q] = 0.1 * d2q9_w[q]; + block[y][x][q] = 0.1 * d2q9_w[q]; - /* except here with 0.2 */ - for(int q = 0; q < 9; q++) - block[0][0][q] = 0.2 * d2q9_w[q]; + if(core == 0) { + /* except here with 0.2 */ + for(int q = 0; q < 9; q++) + block[0][0][q] = 0.2 * d2q9_w[q]; + } return; } -void collide_and_swap(d2q9_block_t f, int x, int y, FLOAT omega) +void d2q9_collide(d2q9_block_t f, int x, int y, FLOAT omega) { /* macroscopic */ - FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] + - f[x][y][3] + f[x][y][4] + f[x][y][5] + - f[x][y][6] + f[x][y][7] + f[x][y][8]; - FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] - - f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho; - FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] - - f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho; + FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] + + f[y][x][3] + f[y][x][4] + f[y][x][5] + + f[y][x][6] + f[y][x][7] + f[y][x][8]; + FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] - + f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho; + FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] - + f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho; FLOAT sqr = 1.5 * (ux*ux + uy*uy); /* update node */ @@ -46,49 +50,64 @@ void collide_and_swap(d2q9_block_t f, int x, int y, FLOAT omega) FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1]; FLOAT eq = rho * d2q9_w[q] * (1. + 3. * cu + 4.5 * cu*cu - sqr); - f[x][y][q] *= (1.0 - omega); - f[x][y][q] += omega * eq; + f[y][x][q] *= (1.0 - omega); + f[y][x][q] += omega * eq; } /* swap */ for(int q = 1; q <= 4; q++) { - FLOAT tmp = f[x][y][q]; - f[x][y][q] = f[x][y][q+4]; - f[x][y][q+4] = tmp; + FLOAT tmp = f[y][x][q]; + f[y][x][q] = f[y][x][q+4]; + f[y][x][q+4] = tmp; } } -void stream_node(d2q9_block_t f, int x, int y) +void d2q9_stream(d2q9_block_t f, int x, int y) { for(int q = 1; q <= 4; q++) { - int next_x = x + d2q9_v[q][0]; - int next_y = y + d2q9_v[q][1]; - - /* wrap around */ - if(next_x < 0) next_x += BLOCK_X; - else if(next_x >= BLOCK_X) next_x -= BLOCK_X; - if(next_y < 0) next_y += BLOCK_Y; - else if(next_y >= BLOCK_Y) next_y -= BLOCK_Y; - - FLOAT tmp = f[x][y][q+4]; - f[x][y][q+4] = f[next_x][next_y][q]; - f[next_x][next_y][q] = tmp; + int next_row = row; + int next_col = col; + int next_x = x + d2q9_v[q][0]; + int next_y = y + d2q9_v[q][1]; + + /* inner borders (extend) */ + if(next_x < 0) { next_col--; next_x += BLOCK_X; } + else if(next_x >= BLOCK_X) { next_col++; next_x -= BLOCK_X; } + if(next_y < 0) { next_row--; next_y += BLOCK_Y; } + else if(next_y >= BLOCK_Y) { next_row++; next_y -= BLOCK_Y; } + + /* outer borders (wrap around) */ + if(next_col < 0) { next_col += CORES_X; } + else if(next_col >= CORES_X) { next_col -= CORES_X; } + if(next_row < 0) { next_row += CORES_Y; } + else if(next_row >= CORES_Y) { next_row -= CORES_Y; } + + /* f: local block, g: local or remote block */ + d2q9_block_t *g = (void*)f; + if(next_row != row || next_col != col) { + g = e_get_global_address(next_col, next_row, (void*)f); + } + + /* stream/swap f and g */ + FLOAT tmp = f[y][x][q+4]; + f[y][x][q+4] = (*g)[next_y][next_x][q]; + (*g)[next_y][next_x][q] = tmp; } } -void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega) +void d2q9_collide_stream_bulk(d2q9_block_t f, FLOAT omega) { /* don't touch the border nodes */ for(int x = 1; x < BLOCK_X-1; x++) { for(int y = 1; y < BLOCK_Y-1; y++) { /* macroscopic */ - FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] + - f[x][y][3] + f[x][y][4] + f[x][y][5] + - f[x][y][6] + f[x][y][7] + f[x][y][8]; - FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] - - f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho; - FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] - - f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho; + FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] + + f[y][x][3] + f[y][x][4] + f[y][x][5] + + f[y][x][6] + f[y][x][7] + f[y][x][8]; + FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] - + f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho; + FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] - + f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho; FLOAT sqr = 1.5 * (ux*ux + uy*uy); /* update node */ @@ -96,8 +115,8 @@ void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega) FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1]; FLOAT eq = rho * d2q9_w[q] * (1. + 3. * cu + 4.5 * cu*cu - sqr); - f[x][y][q] *= (1.0 - omega); - f[x][y][q] += omega * eq; + f[y][x][q] *= (1.0 - omega); + f[y][x][q] += omega * eq; } /* stream */ @@ -105,10 +124,10 @@ void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega) int next_x = x + d2q9_v[q][0]; int next_y = y + d2q9_v[q][1]; - FLOAT tmp = f[x][y][q]; - f[x][y][q] = f[x][y][q+4]; - f[x][y][q+4] = f[next_x][next_y][q]; - f[next_x][next_y][q] = tmp; + FLOAT tmp = f[y][x][q]; + f[y][x][q] = f[y][x][q+4]; + f[y][x][q+4] = f[next_y][next_x][q]; + f[next_y][next_x][q] = tmp; } } } diff --git a/lb/esrc/lb.h b/lb/esrc/lb.h index 9c099bf..a72b39b 100644 --- a/lb/esrc/lb.h +++ b/lb/esrc/lb.h @@ -1,14 +1,13 @@ -/* D2Q9 lattice boltzmann functions */ +/* lattice boltzmann functions */ #include "../shared.h" -void init_block (d2q9_block_t); - -void collide_and_swap (d2q9_block_t, int x, int y, FLOAT); -void stream_node (d2q9_block_t, int x, int y); - -void collide_and_stream_bulk(d2q9_block_t, FLOAT); - - +/* core index */ +extern unsigned int row, col, core; +/* D2Q9 functions */ +void d2q9_init (d2q9_block_t); +void d2q9_collide (d2q9_block_t, int x, int y, FLOAT); +void d2q9_stream (d2q9_block_t, int x, int y); +void d2q9_collide_stream_bulk(d2q9_block_t, FLOAT); diff --git a/lb/esrc/lb_2d.c b/lb/esrc/lb_2d.c index d58dedf..094ed79 100644 --- a/lb/esrc/lb_2d.c +++ b/lb/esrc/lb_2d.c @@ -11,87 +11,107 @@ /* shared memory overlay */ volatile shm_t shm SECTION(".shared_dram"); -/* statically allocate dummy memory - to prevent linker from putting stuff there */ -uint8_t dummy_bank1[8192] SECTION(".data_bank1"); -uint8_t dummy_bank2[8192] SECTION(".data_bank2"); -uint8_t dummy_bank3[8192] SECTION(".data_bank3"); +/* statically allocate dummy memory and local block overlay + to prevent linker from putting stuff in banks 1..3 */ +static uint8_t dummy_bank1[8192] UNUSED SECTION(".data_bank1"); +static uint8_t dummy_bank2[8192] UNUSED SECTION(".data_bank2"); +static uint8_t dummy_bank3[8192] UNUSED SECTION(".data_bank3"); +static d2q9_block_t *block = (void*)0x2000; -/* local block, aliased over dummy_bankX */ -d2q9_block_t *block = (void*)0x2000; +/* barrier structures */ +volatile e_barrier_t barriers[NUM_CORES]; + e_barrier_t *tgt_bars[NUM_CORES]; -void delay(void) +/* global index variables */ +unsigned int row, col, core; + +void delay(int x) { - for(volatile int i = 0; i < 1000000; i++) - for(volatile int j = 0; j < 10; j++) + for(volatile int j = 0; j < x; j++) + for(volatile int i = 0; i < 1000000; i++) ; } -int main() +void init(void) { + /* compile-time checks */ BUILD_BUG(BLOCK_X * BLOCK_Y * sizeof(d2q9_node_t) > 24*1024); + BUILD_BUG(CORES_X > 4 || CORES_Y > 4); + + /* core index */ + e_coords_from_coreid(e_get_coreid(), &col, &row); + core = row * CORES_X + col; + /* barrier initialization */ + e_barrier_init(barriers, tgt_bars); +} + +int main() +{ const FLOAT omega = 1.0; - init_block(*block); + init(); + d2q9_init(*block); - while(1) { + for(int i = 0; i < 10000; i++) { #if 0 /* collide all nodes */ - for(int x = 0; x < BLOCK_X; x++) - for(int y = 0; y < BLOCK_Y; y++) - collide_and_swap(*block, x, y, omega); + for(int y = 0; y < BLOCK_Y; y++) + for(int x = 0; x < BLOCK_X; x++) + d2q9_collide(*block, x, y, omega); - /* XXX synchronize */ + /* synchronize */ + e_barrier(barriers, tgt_bars); /* stream all nodes */ - for(int x = 0; x < BLOCK_X; x++) - for(int y = 0; y < BLOCK_Y; y++) - stream_node(*block, x, y); - - /* XXX synchronize */ + for(int y = 0; y < BLOCK_Y; y++) + for(int x = 0; x < BLOCK_X; x++) + d2q9_stream(*block, x, y); #else /* collide boundaries: top, bottom */ for(int x = 0; x < BLOCK_X; x++) { - collide_and_swap(*block, x, 0, omega); - collide_and_swap(*block, x, BLOCK_Y-1, omega); + d2q9_collide(*block, x, 0, omega); + d2q9_collide(*block, x, BLOCK_Y-1, omega); } /* collide boundaries: left, right */ for(int y = 1; y < BLOCK_Y-1; y++) { - collide_and_swap(*block, 0, y, omega); - collide_and_swap(*block, BLOCK_X-1, y, omega); + d2q9_collide(*block, 0, y, omega); + d2q9_collide(*block, BLOCK_X-1, y, omega); } - /* XXX synchronize */ + /* synchronize */ + e_barrier(barriers, tgt_bars); /* collide and stream the bulk */ - collide_and_stream_bulk(*block, omega); + d2q9_collide_stream_bulk(*block, omega); /* stream the boundaries: left, right */ for(int x = 0; x < BLOCK_X; x++) { - stream_node(*block, x, 0 ); - stream_node(*block, x, BLOCK_Y-1); + d2q9_stream(*block, x, 0 ); + d2q9_stream(*block, x, BLOCK_Y-1); } /* stream the boundaries: left, right */ for(int y = 1; y < BLOCK_Y-1; y++) { - stream_node(*block, 0, y); - stream_node(*block, BLOCK_X-1, y); + d2q9_stream(*block, 0, y); + d2q9_stream(*block, BLOCK_X-1, y); } - - /* XXX synchronize */ #endif - - /* copy grid to shm */ - memcpy(&shm.lattice[0], block, sizeof(d2q9_block_t)); + /* copy grid to shm and synchronize */ + memcpy(&shm.lattice[row][col], block, sizeof(d2q9_block_t)); + e_barrier(barriers, tgt_bars); /* flag host */ - shm.states[0]++; + if(core == 0 && !(i%100)) { + shm.states[row][col]++; + delay(1); + } } + shm.states[row][col] = -1; while(1); } diff --git a/lb/hsrc/main.c b/lb/hsrc/main.c index 971425a..1e90eac 100644 --- a/lb/hsrc/main.c +++ b/lb/hsrc/main.c @@ -1,7 +1,6 @@ /* Host Application */ #include -#include #include #include #include @@ -14,16 +13,16 @@ #define SHM_OFFSET 0x01000000 static states_t laststates, states; /* old state value */ -static shm_t shm = {{ 0 }}; /* local shm copy */ +static shm_t shm = {{{ 0 }}}; /* local shm copy */ -void write_populations(FILE *file, int core, int iter) +void write_populations(FILE *file, int core_x, int core_y, int iter) { - for(int x = 0; x < BLOCK_X; x++) { - for(int y = 0; y < BLOCK_Y; y++) { + for(int y = 0; y < BLOCK_Y; y++) { + for(int x = 0; x < BLOCK_X; x++) { fprintf(file, "%03d: [%02d,%02d]: ", iter, x, y); for(int q = 0; q < 9; q++) { fprintf(file, "%.5f\t", - shm.lattice[core][x][y][q]); + shm.lattice[core_y][core_x][y][x][q]); } fprintf(file, "\n"); } @@ -37,52 +36,49 @@ void write_image(int iter) { FILE *file; char name[32]; - FLOAT rhos[4][4][BLOCK_X][BLOCK_Y]; - FLOAT min = 1, max = 0; - uint8_t gray; - snprintf(name, 32, "./tmp/i%06d.ppm", iter); file = fopen(name, "wb"); if(!file) exit(-1); - fprintf(file, "P5\n%d %d\n%d\n", BLOCK_X, BLOCK_Y, 255); + fprintf(file, "P5\n%d %d\n%d\n", + CORES_X*BLOCK_X, CORES_Y*BLOCK_Y, 255); /* calculate all densities and remember min/max */ - int cx = 0, cy = 0; -// for(int cy = 0; cy < 1; cy++) { -// for(int cx = 0; cx < 1; cx++) { - for(int x = 0; x < BLOCK_X; x++) { - for(int y = 0; y < BLOCK_Y; y++) { - rhos[cy][cx][x][y] = 0; - for(int q = 0; q < 9; q++) { - rhos[cy][cx][x][y] += - shm.lattice[cy*4+cx][x][y][q]; - } - - if(rhos[cy][cx][x][y] < min) - min = rhos[cy][cx][x][y]; - if(rhos[cy][cx][x][y] > max) - max = rhos[cy][cx][x][y]; + FLOAT min = 1.0, max = 0; + FLOAT rhos[CORES_Y][BLOCK_Y][CORES_X][BLOCK_X]; + for(int cy = 0; cy < CORES_Y; cy++) { + for(int y = 0; y < BLOCK_Y; y++) { + for(int cx = 0; cx < CORES_X; cx++) { + for(int x = 0; x < BLOCK_X; x++) { + FLOAT rho = 0; + for(int q = 0; q < 9; q++) + rho += shm.lattice[cy][cx][y][x][q]; + rhos[cy][y][cx][x] = rho; + + if(rho < min) min = rho; + if(rho > max) max = rho; } } -// } -// } + } + } /* now scale values and write to image file */ -// for(int cy = 0; cy < 4; cy++) { -// for(int cx = 0; cx < 4; cx++) { - for(int x = 0; x < BLOCK_X; x++) { - for(int y = 0; y < BLOCK_Y; y++) { - gray = (int)(255.*(rhos[cy][cx][x][y]-min) / (max-min)); + for(int cy = 0; cy < CORES_Y; cy++) { + for(int y = 0; y < BLOCK_Y; y++) { + for(int cx = 0; cx < CORES_X; cx++) { + for(int x = 0; x < BLOCK_X; x++) { + unsigned char gray; + gray = (255. * (rhos[cy][y][cx][x]-min) + / (max-min)); fwrite(&gray, 1, 1, file); } } -// } -// } + } + } fclose(file); if(chown(name, atoi(getenv("SUDO_UID")), atoi(getenv("SUDO_GID")))) { - perror("chown"); + FAIL("Can't chown image!\n"); } return; @@ -94,14 +90,16 @@ int main() FILE *datfile; char *datname = "populations.dat"; int dummy, old0 = 0; + /* remove old results */ + dummy = system("rm -f ./tmp/i*.ppm ./tmp/anim.gif populations.dat"); + (void)dummy; + e_epiphany_t dev; e_mem_t mem; e_set_host_verbosity(H_D0); e_set_loader_verbosity(L_D0); - dummy = system("rm ./tmp/i*.ppm ./tmp/anim.gif"); (void)dummy; - /* overwrite results file */ datfile = fopen(datname, "w"); if(!datfile) @@ -111,7 +109,7 @@ int main() if(e_init(NULL) != E_OK) FAIL("Can't init!\n"); e_reset_system(); - if(e_open(&dev, 0, 0, 4, 4) != E_OK) + if(e_open(&dev, 0, 0, CORES_X, CORES_Y) != E_OK) FAIL("Can't open!\n"); if(e_alloc(&mem, SHM_OFFSET, sizeof(shm_t)) != E_OK) FAIL("Can't alloc!\n"); @@ -119,12 +117,19 @@ int main() FAIL("Can't clear shm!\n"); /* load programs */ - if(e_load(filename, &dev, 0, 0, E_TRUE) != E_OK) - FAIL("Can't load!\n"); + printf("Starting cores:\n"); + for(int y = 0; y < CORES_Y; y++) { + for(int x = 0; x < CORES_X; x++) { + printf("(%02d,%02d) ", x, y); + if(e_load(filename, &dev, x, y, E_TRUE) != E_OK) + FAIL("Can't load!\n"); + } + printf("\n"); + } /* ================================================================ */ + printf("Polling shared memory.\n"); while(1) { - printf("Polling shared memory.\n"); while(1) { /* read states */ @@ -146,23 +151,27 @@ int main() memcpy(&laststates, &shm, sizeof(states_t)); /* print states */ - printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n", - states[0], states[1], states[2], states[3]); - printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n", - states[4], states[5], states[6], states[7]); - printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n", - states[8], states[9], states[10], states[11]); - printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n", - states[12], states[13], states[14], states[15]); - - /* write populations */ - if(states[0] != old0) { - write_populations(datfile, 0, states[0]); - write_image(states[0]); - old0 = states[0]; +#if 0 + for(int y = 0; y < CORES_Y; y++) { + printf("\t"); + for(int x = 0; x < CORES_X; x++) { + printf("0x%08x ", states[y][x]); + } + printf("\n"); + } +#else + printf("0x%08x\r", states[0][0]); + fflush(stdout); +#endif + + /* write data */ + if(states[0][0] != old0) { + //write_populations(datfile, 0, states[0]); + write_image(states[0][0]); + old0 = states[0][0]; } - if(states[0] >= 21) break; + if(states[0][0] == -1) break; } /* ================================================================ */ diff --git a/lb/shared.h b/lb/shared.h index 7000418..e752a35 100644 --- a/lb/shared.h +++ b/lb/shared.h @@ -4,35 +4,36 @@ #include -/* PACKED is defined for e-gcc, but not for gcc */ +/* preprocessor magic */ +#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0); +#define UNUSED __attribute__((unused)) #ifndef PACKED #define PACKED __attribute__((packed)) #endif /* PACKED */ -/* produce compile-time errors if condition is true */ -#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0); - /* number of cores */ -#define NUM_CORES 16 +#define CORES_X 4 +#define CORES_Y 4 +#define NUM_CORES (CORES_X * CORES_Y) /* size of per-core subgrid */ -#define BLOCK_X 15 -#define BLOCK_Y 15 +#define BLOCK_X 26 +#define BLOCK_Y 26 /* floating point type */ typedef float FLOAT; /* state type */ -typedef uint32_t states_t[NUM_CORES]; +typedef uint32_t states_t[CORES_Y][CORES_X]; /* node and block type (D2Q9) */ typedef FLOAT d2q9_node_t[9]; -typedef d2q9_node_t d2q9_block_t[BLOCK_X][BLOCK_Y]; +typedef d2q9_node_t d2q9_block_t[BLOCK_Y][BLOCK_X]; /* shared memory structure */ typedef struct { states_t states; - d2q9_block_t lattice[NUM_CORES]; + d2q9_block_t lattice[CORES_Y][CORES_X]; } PACKED shm_t; #endif /* _SHARED_H_ */