inner borders
author Sebastian <git@sraa.de>
Mon, 30 Jun 2014 22:08:10 +0000 (22:08 +0000)
committer Sebastian <git@sraa.de>
Mon, 30 Jun 2014 22:08:10 +0000 (22:08 +0000)
- implement inner borders; the outer borders still wrap around
- shm_t is indexed by CORES_X, CORES_Y instead of linearized core numbers
- change index order to be [y][x] everywhere
- maximum size is now 104x104 (using 26x26 blocks in a 4x4 grid)
- compile-time errors (BUILD_BUG) for anything larger
- finally supports non-square block and grid sizes

lb/esrc/d2q9.c
lb/esrc/lb.h
lb/esrc/lb_2d.c
lb/hsrc/main.c
lb/shared.h

index 22f5f5fc4cb319cfc199d40ece287b7fa6ead6e0..c4ada9d2bccd9bb6a6cf9304109eb00da6283094 100644 (file)
@@ -1,6 +1,8 @@
 /* D2Q9 lattice boltzmann functions */
 
+#include <e-lib.h>
 #include "../shared.h"
+#include "lb.h"
 
 /* velocities */
 static const int d2q9_v[9][2] = { { 0, 0},
@@ -14,31 +16,33 @@ static const FLOAT d2q9_w[9] = { 4./9.,
        1./36., 1./9., 1./36., 1./9.,
 };
 
-void init_block(d2q9_block_t block)
+void d2q9_init(d2q9_block_t block)
 {
        /* all with rho = 0.1 */
-       for(int x = 0; x < BLOCK_X; x++)
-               for(int y = 0; y < BLOCK_Y; y++)
+       for(int y = 0; y < BLOCK_Y; y++)
+               for(int x = 0; x < BLOCK_X; x++)
                        for(int q = 0; q < 9; q++)
-                               block[x][y][q] = 0.1 * d2q9_w[q];
+                               block[y][x][q] = 0.1 * d2q9_w[q];
 
-       /* except here with 0.2 */
-       for(int q = 0; q < 9; q++)
-               block[0][0][q] = 0.2 * d2q9_w[q];
+       if(core == 0) {
+               /* except here with 0.2 */
+               for(int q = 0; q < 9; q++)
+                       block[0][0][q] = 0.2 * d2q9_w[q];
+       }
 
        return;
 }
 
-void collide_and_swap(d2q9_block_t f, int x, int y, FLOAT omega)
+void d2q9_collide(d2q9_block_t f, int x, int y, FLOAT omega)
 {
        /* macroscopic */
-       FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] +
-               f[x][y][3] + f[x][y][4] + f[x][y][5] +
-               f[x][y][6] + f[x][y][7] + f[x][y][8];
-       FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] -
-               f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho;
-       FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] -
-               f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho;
+       FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+               f[y][x][3] + f[y][x][4] + f[y][x][5] +
+               f[y][x][6] + f[y][x][7] + f[y][x][8];
+       FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+               f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+       FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+               f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
        FLOAT sqr = 1.5 * (ux*ux + uy*uy);
 
        /* update node */
@@ -46,49 +50,64 @@ void collide_and_swap(d2q9_block_t f, int x, int y, FLOAT omega)
                FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
                FLOAT eq = rho * d2q9_w[q] *
                        (1. + 3. * cu + 4.5 * cu*cu - sqr);
-               f[x][y][q] *= (1.0 - omega);
-               f[x][y][q] += omega * eq;
+               f[y][x][q] *= (1.0 - omega);
+               f[y][x][q] += omega * eq;
        }
 
        /* swap */
        for(int q = 1; q <= 4; q++) {
-               FLOAT tmp    = f[x][y][q];
-               f[x][y][q]   = f[x][y][q+4];
-               f[x][y][q+4] = tmp;
+               FLOAT tmp    = f[y][x][q];
+               f[y][x][q]   = f[y][x][q+4];
+               f[y][x][q+4] = tmp;
        }
 }
 
-void stream_node(d2q9_block_t f, int x, int y)
+void d2q9_stream(d2q9_block_t f, int x, int y)
 {
        for(int q = 1; q <= 4; q++) {
-               int next_x = x + d2q9_v[q][0];
-               int next_y = y + d2q9_v[q][1];
-
-               /* wrap around */
-               if(next_x < 0)             next_x += BLOCK_X;
-               else if(next_x >= BLOCK_X) next_x -= BLOCK_X;
-               if(next_y < 0)             next_y += BLOCK_Y;
-               else if(next_y >= BLOCK_Y) next_y -= BLOCK_Y;
-
-               FLOAT tmp    = f[x][y][q+4];
-               f[x][y][q+4] = f[next_x][next_y][q];
-               f[next_x][next_y][q] = tmp;
+               int next_row = row;
+               int next_col = col;
+               int next_x   = x + d2q9_v[q][0];
+               int next_y   = y + d2q9_v[q][1];
+
+               /* inner borders (extend) */
+               if(next_x < 0)             { next_col--; next_x += BLOCK_X; }
+               else if(next_x >= BLOCK_X) { next_col++; next_x -= BLOCK_X; }
+               if(next_y < 0)             { next_row--; next_y += BLOCK_Y; }
+               else if(next_y >= BLOCK_Y) { next_row++; next_y -= BLOCK_Y; }
+
+               /* outer borders (wrap around) */
+               if(next_col < 0)             { next_col += CORES_X; }
+               else if(next_col >= CORES_X) { next_col -= CORES_X; }
+               if(next_row < 0)             { next_row += CORES_Y; }
+               else if(next_row >= CORES_Y) { next_row -= CORES_Y; }
+
+               /* f: local block, g: local or remote block */
+               d2q9_block_t *g = (void*)f;
+               if(next_row != row || next_col != col) {
+                       g = e_get_global_address(next_col, next_row, (void*)f);
+               }
+
+               /* stream/swap f and g */
+               FLOAT tmp    = f[y][x][q+4];
+               f[y][x][q+4] = (*g)[next_y][next_x][q];
+               (*g)[next_y][next_x][q] = tmp;
        }
 }
 
-void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega)
+void d2q9_collide_stream_bulk(d2q9_block_t f, FLOAT omega)
 {
        /* don't touch the border nodes */
        for(int x = 1; x < BLOCK_X-1; x++) {
                for(int y = 1; y < BLOCK_Y-1; y++) {
                        /* macroscopic */
-                       FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] +
-                               f[x][y][3] + f[x][y][4] + f[x][y][5] +
-                               f[x][y][6] + f[x][y][7] + f[x][y][8];
-                       FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] -
-                               f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho;
-                       FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] -
-                               f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho;
+                       FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+                               f[y][x][3] + f[y][x][4] + f[y][x][5] +
+                               f[y][x][6] + f[y][x][7] + f[y][x][8];
+                       FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+                               f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+                       FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+                               f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
                        FLOAT sqr = 1.5 * (ux*ux + uy*uy);
 
                        /* update node */
@@ -96,8 +115,8 @@ void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega)
                                FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
                                FLOAT eq = rho * d2q9_w[q] *
                                        (1. + 3. * cu + 4.5 * cu*cu - sqr);
-                               f[x][y][q] *= (1.0 - omega);
-                               f[x][y][q] += omega * eq;
+                               f[y][x][q] *= (1.0 - omega);
+                               f[y][x][q] += omega * eq;
                        }
 
                        /* stream */
@@ -105,10 +124,10 @@ void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega)
                                int next_x = x + d2q9_v[q][0];
                                int next_y = y + d2q9_v[q][1];
 
-                               FLOAT tmp     = f[x][y][q];
-                               f[x][y][q]   = f[x][y][q+4];
-                               f[x][y][q+4] = f[next_x][next_y][q];
-                               f[next_x][next_y][q] = tmp;
+                               FLOAT tmp    = f[y][x][q];
+                               f[y][x][q]   = f[y][x][q+4];
+                               f[y][x][q+4] = f[next_y][next_x][q];
+                               f[next_y][next_x][q] = tmp;
                        }
                }
        }
index 9c099bf3587fe6e9549fe2ecb60acd01ec9a36ed..a72b39bbeacc262c07a7a9b99d6ee9485a02249d 100644 (file)
@@ -1,14 +1,13 @@
-/* D2Q9 lattice boltzmann functions */
+/* lattice boltzmann functions */
 
 #include "../shared.h"
 
-void init_block             (d2q9_block_t);
-
-void collide_and_swap       (d2q9_block_t, int x, int y,  FLOAT);
-void stream_node            (d2q9_block_t, int x, int y);
-
-void collide_and_stream_bulk(d2q9_block_t, FLOAT);
-
-
+/* core index */
+extern unsigned int row, col, core;
 
+/* D2Q9 functions */
+void d2q9_init               (d2q9_block_t);
+void d2q9_collide            (d2q9_block_t, int x, int y,  FLOAT);
+void d2q9_stream             (d2q9_block_t, int x, int y);
+void d2q9_collide_stream_bulk(d2q9_block_t, FLOAT);
 
index d58dedf2cc39ddde3389c0314e6410994e08e9c0..094ed79e2778c17f1bf852bf74a3eb9667569fdc 100644 (file)
 /* shared memory overlay */
 volatile shm_t shm SECTION(".shared_dram");
 
-/* statically allocate dummy memory
-   to prevent linker from putting stuff there */
-uint8_t dummy_bank1[8192] SECTION(".data_bank1");
-uint8_t dummy_bank2[8192] SECTION(".data_bank2");
-uint8_t dummy_bank3[8192] SECTION(".data_bank3");
+/* statically allocate dummy memory and local block overlay
+   to prevent linker from putting stuff in banks 1..3 */
+static uint8_t      dummy_bank1[8192] UNUSED SECTION(".data_bank1");
+static uint8_t      dummy_bank2[8192] UNUSED SECTION(".data_bank2");
+static uint8_t      dummy_bank3[8192] UNUSED SECTION(".data_bank3");
+static d2q9_block_t *block = (void*)0x2000;
 
-/* local block, aliased over dummy_bankX */
-d2q9_block_t *block = (void*)0x2000;
+/* barrier structures */
+volatile e_barrier_t  barriers[NUM_CORES];
+         e_barrier_t *tgt_bars[NUM_CORES];
 
-void delay(void)
+/* global index variables */
+unsigned int row, col, core;
+
+void delay(int x)
 {
-       for(volatile int i = 0; i < 1000000; i++)
-               for(volatile int j = 0; j < 10; j++)
+       for(volatile int j = 0; j < x; j++)
+               for(volatile int i = 0; i < 1000000; i++)
                        ;
 }
 
-int main()
+void init(void)
 {
+       /* compile-time checks */
        BUILD_BUG(BLOCK_X * BLOCK_Y * sizeof(d2q9_node_t) > 24*1024);
+       BUILD_BUG(CORES_X > 4 || CORES_Y > 4);
+
+       /* core index */
+       e_coords_from_coreid(e_get_coreid(), &col, &row);
+       core = row * CORES_X + col;
 
+       /* barrier initialization */
+       e_barrier_init(barriers, tgt_bars);
+}
+
+int main()
+{
        const FLOAT omega = 1.0;
 
-       init_block(*block);
+       init();
+       d2q9_init(*block);
 
-       while(1) {
+       for(int i = 0; i < 10000; i++) {
 #if 0
                /* collide all nodes */
-               for(int x = 0; x < BLOCK_X; x++)
-                       for(int y = 0; y < BLOCK_Y; y++)
-                               collide_and_swap(*block, x, y, omega);
+               for(int y = 0; y < BLOCK_Y; y++)
+                       for(int x = 0; x < BLOCK_X; x++)
+                               d2q9_collide(*block, x, y, omega);
 
-               /* XXX synchronize */
+               /* synchronize */
+               e_barrier(barriers, tgt_bars);
 
                /* stream all nodes */
-               for(int x = 0; x < BLOCK_X; x++)
-                       for(int y = 0; y < BLOCK_Y; y++)
-                               stream_node(*block, x, y);
-
-               /* XXX synchronize */
+               for(int y = 0; y < BLOCK_Y; y++)
+                       for(int x = 0; x < BLOCK_X; x++)
+                               d2q9_stream(*block, x, y);
 
 #else
                /* collide boundaries: top, bottom */
                for(int x = 0; x < BLOCK_X; x++) {
-                       collide_and_swap(*block, x, 0,         omega);
-                       collide_and_swap(*block, x, BLOCK_Y-1, omega);
+                       d2q9_collide(*block, x, 0,         omega);
+                       d2q9_collide(*block, x, BLOCK_Y-1, omega);
                }
 
                /* collide boundaries: left, right */
                for(int y = 1; y < BLOCK_Y-1; y++) {
-                       collide_and_swap(*block, 0,         y, omega);
-                       collide_and_swap(*block, BLOCK_X-1, y, omega);
+                       d2q9_collide(*block, 0,         y, omega);
+                       d2q9_collide(*block, BLOCK_X-1, y, omega);
                }
 
-               /* XXX synchronize */
+               /* synchronize */
+               e_barrier(barriers, tgt_bars);
 
                /* collide and stream the bulk */
-               collide_and_stream_bulk(*block, omega);
+               d2q9_collide_stream_bulk(*block, omega);
 
                /* stream the boundaries: left, right */
                for(int x = 0; x < BLOCK_X; x++) {
-                       stream_node(*block, x, 0        );
-                       stream_node(*block, x, BLOCK_Y-1);
+                       d2q9_stream(*block, x, 0        );
+                       d2q9_stream(*block, x, BLOCK_Y-1);
                }
 
                /* stream the boundaries: left, right */
                for(int y = 1; y < BLOCK_Y-1; y++) {
-                       stream_node(*block, 0,         y);
-                       stream_node(*block, BLOCK_X-1, y);
+                       d2q9_stream(*block, 0,         y);
+                       d2q9_stream(*block, BLOCK_X-1, y);
                }
-
-               /* XXX synchronize */
 #endif
 
-
-               /* copy grid to shm */
-               memcpy(&shm.lattice[0], block, sizeof(d2q9_block_t));
+               /* copy grid to shm and synchronize */
+               memcpy(&shm.lattice[row][col], block, sizeof(d2q9_block_t));
+               e_barrier(barriers, tgt_bars);
 
                /* flag host */
-               shm.states[0]++;
+               if(core == 0 && !(i%100)) {
+                       shm.states[row][col]++;
+                       delay(1);
+               }
        }
 
+       shm.states[row][col] = -1;
        while(1);
 }
 
index 971425a9c12fe74f419a630057dae8c19adff35b..1e90eac1e80138694bd8102b7c66295497ae4a66 100644 (file)
@@ -1,7 +1,6 @@
 /* Host Application */
 
 #include <stdio.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 #define SHM_OFFSET 0x01000000
 
 static states_t laststates, states;    /* old state value */
-static shm_t    shm = {{ 0 }};         /* local shm copy */
+static shm_t    shm = {{{ 0 }}};       /* local shm copy */
 
-void write_populations(FILE *file, int core, int iter)
+void write_populations(FILE *file, int core_x, int core_y, int iter)
 {
-       for(int x = 0; x < BLOCK_X; x++) {
-               for(int y = 0; y < BLOCK_Y; y++) {
+       for(int y = 0; y < BLOCK_Y; y++) {
+               for(int x = 0; x < BLOCK_X; x++) {
                        fprintf(file, "%03d: [%02d,%02d]: ", iter, x, y);
                        for(int q = 0; q < 9; q++) {
                                fprintf(file, "%.5f\t",
-                                       shm.lattice[core][x][y][q]);
+                                       shm.lattice[core_y][core_x][y][x][q]);
                        }
                        fprintf(file, "\n");
                }
@@ -37,52 +36,49 @@ void write_image(int iter)
 {
        FILE *file; char name[32];
 
-       FLOAT rhos[4][4][BLOCK_X][BLOCK_Y];
-       FLOAT min = 1, max = 0;
-       uint8_t gray;
-
        snprintf(name, 32, "./tmp/i%06d.ppm", iter);
        file = fopen(name, "wb");
        if(!file) exit(-1);
 
-       fprintf(file, "P5\n%d %d\n%d\n", BLOCK_X, BLOCK_Y, 255);
+       fprintf(file, "P5\n%d %d\n%d\n",
+               CORES_X*BLOCK_X, CORES_Y*BLOCK_Y, 255);
 
        /* calculate all densities and remember min/max */
-       int cx = 0, cy = 0;
-//     for(int cy = 0; cy < 1; cy++) {
-//             for(int cx = 0; cx < 1; cx++) {
-                       for(int x = 0; x < BLOCK_X; x++) {
-                               for(int y = 0; y < BLOCK_Y; y++) {
-                                       rhos[cy][cx][x][y] = 0;
-                                       for(int q = 0; q < 9; q++) {
-                                               rhos[cy][cx][x][y] +=
-                                               shm.lattice[cy*4+cx][x][y][q];
-                                       }
-
-                                       if(rhos[cy][cx][x][y] < min)
-                                               min = rhos[cy][cx][x][y];
-                                       if(rhos[cy][cx][x][y] > max)
-                                               max = rhos[cy][cx][x][y];
+       FLOAT min = 1.0, max = 0;
+       FLOAT rhos[CORES_Y][BLOCK_Y][CORES_X][BLOCK_X];
+       for(int cy = 0; cy < CORES_Y; cy++) {
+               for(int y = 0; y < BLOCK_Y; y++) {
+                       for(int cx = 0; cx < CORES_X; cx++) {
+                               for(int x = 0; x < BLOCK_X; x++) {
+                                       FLOAT rho = 0;
+                                       for(int q = 0; q < 9; q++)
+                                               rho += shm.lattice[cy][cx][y][x][q];
+                                       rhos[cy][y][cx][x] = rho;
+
+                                       if(rho < min) min = rho;
+                                       if(rho > max) max = rho;
                                }
                        }
-//             }
-//     }
+               }
+       }
 
        /* now scale values and write to image file */
-//     for(int cy = 0; cy < 4; cy++) {
-//             for(int cx = 0; cx < 4; cx++) {
-                       for(int x = 0; x < BLOCK_X; x++) {
-                               for(int y = 0; y < BLOCK_Y; y++) {
-                                       gray = (int)(255.*(rhos[cy][cx][x][y]-min) / (max-min));
+       for(int cy = 0; cy < CORES_Y; cy++) {
+               for(int y = 0; y < BLOCK_Y; y++) {
+                       for(int cx = 0; cx < CORES_X; cx++) {
+                               for(int x = 0; x < BLOCK_X; x++) {
+                                       unsigned char gray;
+                                       gray = (255. * (rhos[cy][y][cx][x]-min)
+                                               / (max-min));
                                        fwrite(&gray, 1, 1, file);
                                }
                        }
-//             }
-//     }
+               }
+       }
 
        fclose(file);
        if(chown(name, atoi(getenv("SUDO_UID")), atoi(getenv("SUDO_GID")))) {
-               perror("chown");
+               FAIL("Can't chown image!\n");
        }
 
        return;
@@ -94,14 +90,16 @@ int main()
        FILE *datfile; char *datname = "populations.dat";
        int dummy, old0 = 0;
 
+       /* remove old results */
+       dummy = system("rm -f ./tmp/i*.ppm ./tmp/anim.gif populations.dat");
+       (void)dummy;
+
        e_epiphany_t dev;
        e_mem_t      mem;
 
        e_set_host_verbosity(H_D0);
        e_set_loader_verbosity(L_D0);
 
-       dummy = system("rm ./tmp/i*.ppm ./tmp/anim.gif"); (void)dummy;
-
        /* overwrite results file */
        datfile = fopen(datname, "w");
        if(!datfile)
@@ -111,7 +109,7 @@ int main()
        if(e_init(NULL) != E_OK)
                FAIL("Can't init!\n");
        e_reset_system();
-       if(e_open(&dev, 0, 0, 4, 4) != E_OK)
+       if(e_open(&dev, 0, 0, CORES_X, CORES_Y) != E_OK)
                FAIL("Can't open!\n");
        if(e_alloc(&mem, SHM_OFFSET, sizeof(shm_t)) != E_OK)
                FAIL("Can't alloc!\n");
@@ -119,12 +117,19 @@ int main()
                FAIL("Can't clear shm!\n");
 
        /* load programs */
-       if(e_load(filename, &dev, 0, 0, E_TRUE) != E_OK)
-               FAIL("Can't load!\n");
+       printf("Starting cores:\n");
+       for(int y = 0; y < CORES_Y; y++) {
+               for(int x = 0; x < CORES_X; x++) {
+                       printf("(%02d,%02d) ", x, y);
+                       if(e_load(filename, &dev, x, y, E_TRUE) != E_OK)
+                               FAIL("Can't load!\n");
+               }
+               printf("\n");
+       }
 
        /* ================================================================ */
+       printf("Polling shared memory.\n");
        while(1) {
-               printf("Polling shared memory.\n");
 
                while(1) {
                        /* read states */
@@ -146,23 +151,27 @@ int main()
                memcpy(&laststates, &shm, sizeof(states_t));
 
                /* print states */
-               printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-                       states[0],  states[1],  states[2],  states[3]);
-               printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-                       states[4],  states[5],  states[6],  states[7]);
-               printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-                       states[8],  states[9],  states[10], states[11]);
-               printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-                       states[12], states[13], states[14], states[15]);
-
-               /* write populations */
-               if(states[0] != old0) {
-                       write_populations(datfile, 0, states[0]);
-                       write_image(states[0]);
-                       old0 = states[0];
+#if 0
+               for(int y = 0; y < CORES_Y; y++) {
+                       printf("\t");
+                       for(int x = 0; x < CORES_X; x++) {
+                               printf("0x%08x ", states[y][x]);
+                       }
+                       printf("\n");
+               }
+#else
+               printf("0x%08x\r", states[0][0]);
+               fflush(stdout);
+#endif
+
+               /* write data */
+               if(states[0][0] != old0) {
+                       //write_populations(datfile, 0, states[0]);
+                       write_image(states[0][0]);
+                       old0 = states[0][0];
                }
 
-               if(states[0] >= 21) break;
+               if(states[0][0] == -1) break;
        }
        /* ================================================================ */
 
index 7000418d387a9fdd69adc5a48d64bc53175f84db..e752a359350f70da7a60bdbb795c3beff0d4c8c1 100644 (file)
@@ -4,35 +4,36 @@
 
 #include <stdint.h>
 
-/* PACKED is defined for e-gcc, but not for gcc */
+/* preprocessor magic */
+#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
+#define UNUSED __attribute__((unused))
 #ifndef PACKED
 #define PACKED __attribute__((packed))
 #endif /* PACKED */
 
-/* produce compile-time errors if condition is true */
-#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
-
 /* number of cores */
-#define NUM_CORES 16
+#define CORES_X 4
+#define CORES_Y 4
+#define NUM_CORES (CORES_X * CORES_Y)
 
 /* size of per-core subgrid */
-#define BLOCK_X 15
-#define BLOCK_Y 15
+#define BLOCK_X 26
+#define BLOCK_Y 26
 
 /* floating point type */
 typedef float FLOAT;
 
 /* state type */
-typedef uint32_t states_t[NUM_CORES];
+typedef uint32_t states_t[CORES_Y][CORES_X];
 
 /* node and block type (D2Q9) */
 typedef FLOAT       d2q9_node_t[9];
-typedef d2q9_node_t d2q9_block_t[BLOCK_X][BLOCK_Y];
+typedef d2q9_node_t d2q9_block_t[BLOCK_Y][BLOCK_X];
 
 /* shared memory structure */
 typedef struct {
        states_t     states;
-       d2q9_block_t lattice[NUM_CORES];
+       d2q9_block_t lattice[CORES_Y][CORES_X];
 } PACKED shm_t;
 
 #endif /* _SHARED_H_ */