From: Sebastian <git@sraa.de>
Date: Mon, 30 Jun 2014 22:08:10 +0000 (+0000)
Subject: inner borders
X-Git-Url: http://sraa.de/git/?a=commitdiff_plain;h=033d3e6771ae79e5f817e0d3c0d456e7e35b4656;p=lattice-boltzmann-epiphany.git

inner borders

- implement inner borders between neighbouring cores; the outer borders still wrap around
- shm_t is indexed by [CORES_Y][CORES_X] instead of a linearized core number
- change index order to be [y][x] everywhere
- maximum size now 104x104 (using 26x26 blocks in a 4x4 grid)
- compile-time errors (BUILD_BUG) for anything larger
- finally supports non-square block and grid sizes
---

diff --git a/lb/esrc/d2q9.c b/lb/esrc/d2q9.c
index 22f5f5f..c4ada9d 100644
--- a/lb/esrc/d2q9.c
+++ b/lb/esrc/d2q9.c
@@ -1,6 +1,8 @@
 /* D2Q9 lattice boltzmann functions */
 
+#include <e-lib.h>
 #include "../shared.h"
+#include "lb.h"
 
 /* velocities */
 static const int d2q9_v[9][2] = { { 0, 0},
@@ -14,31 +16,33 @@ static const FLOAT d2q9_w[9] = { 4./9.,
 	1./36., 1./9., 1./36., 1./9.,
 };
 
-void init_block(d2q9_block_t block)
+void d2q9_init(d2q9_block_t block)
 {
 	/* all with rho = 0.1 */
-	for(int x = 0; x < BLOCK_X; x++)
-		for(int y = 0; y < BLOCK_Y; y++)
+	for(int y = 0; y < BLOCK_Y; y++)
+		for(int x = 0; x < BLOCK_X; x++)
 			for(int q = 0; q < 9; q++)
-				block[x][y][q] = 0.1 * d2q9_w[q];
+				block[y][x][q] = 0.1 * d2q9_w[q];
 
-	/* except here with 0.2 */
-	for(int q = 0; q < 9; q++)
-		block[0][0][q] = 0.2 * d2q9_w[q];
+	if(core == 0) {
+		/* except here with 0.2 */
+		for(int q = 0; q < 9; q++)
+			block[0][0][q] = 0.2 * d2q9_w[q];
+	}
 
 	return;
 }
 
-void collide_and_swap(d2q9_block_t f, int x, int y, FLOAT omega)
+void d2q9_collide(d2q9_block_t f, int x, int y, FLOAT omega)
 {
 	/* macroscopic */
-	FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] +
-		f[x][y][3] + f[x][y][4] + f[x][y][5] +
-		f[x][y][6] + f[x][y][7] + f[x][y][8];
-	FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] -
-		f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho;
-	FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] -
-		f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho;
+	FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+		f[y][x][3] + f[y][x][4] + f[y][x][5] +
+		f[y][x][6] + f[y][x][7] + f[y][x][8];
+	FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+		f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+	FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+		f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
 	FLOAT sqr = 1.5 * (ux*ux + uy*uy);
 
 	/* update node */
@@ -46,49 +50,64 @@ void collide_and_swap(d2q9_block_t f, int x, int y, FLOAT omega)
 		FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
 		FLOAT eq = rho * d2q9_w[q] *
 			(1. + 3. * cu + 4.5 * cu*cu - sqr);
-		f[x][y][q] *= (1.0 - omega);
-		f[x][y][q] += omega * eq;
+		f[y][x][q] *= (1.0 - omega);
+		f[y][x][q] += omega * eq;
 	}
 
 	/* swap */
 	for(int q = 1; q <= 4; q++) {
-		FLOAT tmp    = f[x][y][q];
-		f[x][y][q]   = f[x][y][q+4];
-		f[x][y][q+4] = tmp;
+		FLOAT tmp    = f[y][x][q];
+		f[y][x][q]   = f[y][x][q+4];
+		f[y][x][q+4] = tmp;
 	}
 }
 
-void stream_node(d2q9_block_t f, int x, int y)
+void d2q9_stream(d2q9_block_t f, int x, int y)
 {
 	for(int q = 1; q <= 4; q++) {
-		int next_x = x + d2q9_v[q][0];
-		int next_y = y + d2q9_v[q][1];
-
-		/* wrap around */
-		if(next_x < 0)             next_x += BLOCK_X;
-		else if(next_x >= BLOCK_X) next_x -= BLOCK_X;
-		if(next_y < 0)             next_y += BLOCK_Y;
-		else if(next_y >= BLOCK_Y) next_y -= BLOCK_Y;
-
-		FLOAT tmp    = f[x][y][q+4];
-		f[x][y][q+4] = f[next_x][next_y][q];
-		f[next_x][next_y][q] = tmp;
+		int next_row = row;
+		int next_col = col;
+		int next_x   = x + d2q9_v[q][0];
+		int next_y   = y + d2q9_v[q][1];
+
+		/* inner borders (extend) */
+		if(next_x < 0)             { next_col--; next_x += BLOCK_X; }
+		else if(next_x >= BLOCK_X) { next_col++; next_x -= BLOCK_X; }
+		if(next_y < 0)             { next_row--; next_y += BLOCK_Y; }
+		else if(next_y >= BLOCK_Y) { next_row++; next_y -= BLOCK_Y; }
+
+		/* outer borders (wrap around) */
+		if(next_col < 0)             { next_col += CORES_X; }
+		else if(next_col >= CORES_X) { next_col -= CORES_X; }
+		if(next_row < 0)             { next_row += CORES_Y; }
+		else if(next_row >= CORES_Y) { next_row -= CORES_Y; }
+
+		/* f: local block, g: local or remote block */
+		d2q9_block_t *g = (void*)f;
+		if(next_row != row || next_col != col) {
+			g = e_get_global_address(next_col, next_row, (void*)f);
+		}
+
+		/* stream/swap f and g */
+		FLOAT tmp    = f[y][x][q+4];
+		f[y][x][q+4] = (*g)[next_y][next_x][q];
+		(*g)[next_y][next_x][q] = tmp;
 	}
 }
 
-void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega)
+void d2q9_collide_stream_bulk(d2q9_block_t f, FLOAT omega)
 {
 	/* don't touch the border nodes */
 	for(int x = 1; x < BLOCK_X-1; x++) {
 		for(int y = 1; y < BLOCK_Y-1; y++) {
 			/* macroscopic */
-			FLOAT rho = f[x][y][0] + f[x][y][1] + f[x][y][2] +
-				f[x][y][3] + f[x][y][4] + f[x][y][5] +
-				f[x][y][6] + f[x][y][7] + f[x][y][8];
-			FLOAT ux = (f[x][y][7] + f[x][y][6] + f[x][y][5] -
-				f[x][y][1] - f[x][y][2] - f[x][y][3]) / rho;
-			FLOAT uy = (f[x][y][1] + f[x][y][8] + f[x][y][7] -
-				f[x][y][3] - f[x][y][4] - f[x][y][5]) / rho;
+			FLOAT rho = f[y][x][0] + f[y][x][1] + f[y][x][2] +
+				f[y][x][3] + f[y][x][4] + f[y][x][5] +
+				f[y][x][6] + f[y][x][7] + f[y][x][8];
+			FLOAT ux = (f[y][x][7] + f[y][x][6] + f[y][x][5] -
+				f[y][x][1] - f[y][x][2] - f[y][x][3]) / rho;
+			FLOAT uy = (f[y][x][1] + f[y][x][8] + f[y][x][7] -
+				f[y][x][3] - f[y][x][4] - f[y][x][5]) / rho;
 			FLOAT sqr = 1.5 * (ux*ux + uy*uy);
 
 			/* update node */
@@ -96,8 +115,8 @@ void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega)
 				FLOAT cu = ux*d2q9_v[q][0] + uy*d2q9_v[q][1];
 				FLOAT eq = rho * d2q9_w[q] *
 					(1. + 3. * cu + 4.5 * cu*cu - sqr);
-				f[x][y][q] *= (1.0 - omega);
-				f[x][y][q] += omega * eq;
+				f[y][x][q] *= (1.0 - omega);
+				f[y][x][q] += omega * eq;
 			}
 
 			/* stream */
@@ -105,10 +124,10 @@ void collide_and_stream_bulk(d2q9_block_t f, FLOAT omega)
 				int next_x = x + d2q9_v[q][0];
 				int next_y = y + d2q9_v[q][1];
 
-				FLOAT tmp     = f[x][y][q];
-				f[x][y][q]   = f[x][y][q+4];
-				f[x][y][q+4] = f[next_x][next_y][q];
-				f[next_x][next_y][q] = tmp;
+				FLOAT tmp    = f[y][x][q];
+				f[y][x][q]   = f[y][x][q+4];
+				f[y][x][q+4] = f[next_y][next_x][q];
+				f[next_y][next_x][q] = tmp;
 			}
 		}
 	}
diff --git a/lb/esrc/lb.h b/lb/esrc/lb.h
index 9c099bf..a72b39b 100644
--- a/lb/esrc/lb.h
+++ b/lb/esrc/lb.h
@@ -1,14 +1,13 @@
-/* D2Q9 lattice boltzmann functions */
+/* lattice boltzmann functions */
 
 #include "../shared.h"
 
-void init_block             (d2q9_block_t);
-
-void collide_and_swap       (d2q9_block_t, int x, int y,  FLOAT);
-void stream_node            (d2q9_block_t, int x, int y);
-
-void collide_and_stream_bulk(d2q9_block_t, FLOAT);
-
-
+/* core index */
+extern unsigned int row, col, core;
 
+/* D2Q9 functions */
+void d2q9_init               (d2q9_block_t);
+void d2q9_collide            (d2q9_block_t, int x, int y,  FLOAT);
+void d2q9_stream             (d2q9_block_t, int x, int y);
+void d2q9_collide_stream_bulk(d2q9_block_t, FLOAT);
 
diff --git a/lb/esrc/lb_2d.c b/lb/esrc/lb_2d.c
index d58dedf..094ed79 100644
--- a/lb/esrc/lb_2d.c
+++ b/lb/esrc/lb_2d.c
@@ -11,87 +11,107 @@
 /* shared memory overlay */
 volatile shm_t shm SECTION(".shared_dram");
 
-/* statically allocate dummy memory
-   to prevent linker from putting stuff there */
-uint8_t dummy_bank1[8192] SECTION(".data_bank1");
-uint8_t dummy_bank2[8192] SECTION(".data_bank2");
-uint8_t dummy_bank3[8192] SECTION(".data_bank3");
+/* statically allocate dummy memory and local block overlay
+   to prevent linker from putting stuff in banks 1..3 */
+static uint8_t      dummy_bank1[8192] UNUSED SECTION(".data_bank1");
+static uint8_t      dummy_bank2[8192] UNUSED SECTION(".data_bank2");
+static uint8_t      dummy_bank3[8192] UNUSED SECTION(".data_bank3");
+static d2q9_block_t *block = (void*)0x2000;
 
-/* local block, aliased over dummy_bankX */
-d2q9_block_t *block = (void*)0x2000;
+/* barrier structures */
+volatile e_barrier_t  barriers[NUM_CORES];
+         e_barrier_t *tgt_bars[NUM_CORES];
 
-void delay(void)
+/* global index variables */
+unsigned int row, col, core;
+
+void delay(int x)
 {
-	for(volatile int i = 0; i < 1000000; i++)
-		for(volatile int j = 0; j < 10; j++)
+	for(volatile int j = 0; j < x; j++)
+		for(volatile int i = 0; i < 1000000; i++)
 			;
 }
 
-int main()
+void init(void)
 {
+	/* compile-time checks */
 	BUILD_BUG(BLOCK_X * BLOCK_Y * sizeof(d2q9_node_t) > 24*1024);
+	BUILD_BUG(CORES_X > 4 || CORES_Y > 4);
+
+	/* core index */
+	e_coords_from_coreid(e_get_coreid(), &col, &row);
+	core = row * CORES_X + col;
 
+	/* barrier initialization */
+	e_barrier_init(barriers, tgt_bars);
+}
+
+int main()
+{
 	const FLOAT omega = 1.0;
 
-	init_block(*block);
+	init();
+	d2q9_init(*block);
 
-	while(1) {
+	for(int i = 0; i < 10000; i++) {
 #if 0
 		/* collide all nodes */
-		for(int x = 0; x < BLOCK_X; x++)
-			for(int y = 0; y < BLOCK_Y; y++)
-				collide_and_swap(*block, x, y, omega);
+		for(int y = 0; y < BLOCK_Y; y++)
+			for(int x = 0; x < BLOCK_X; x++)
+				d2q9_collide(*block, x, y, omega);
 
-		/* XXX synchronize */
+		/* synchronize */
+		e_barrier(barriers, tgt_bars);
 
 		/* stream all nodes */
-		for(int x = 0; x < BLOCK_X; x++)
-			for(int y = 0; y < BLOCK_Y; y++)
-				stream_node(*block, x, y);
-
-		/* XXX synchronize */
+		for(int y = 0; y < BLOCK_Y; y++)
+			for(int x = 0; x < BLOCK_X; x++)
+				d2q9_stream(*block, x, y);
 
 #else
 		/* collide boundaries: top, bottom */
 		for(int x = 0; x < BLOCK_X; x++) {
-			collide_and_swap(*block, x, 0,         omega);
-			collide_and_swap(*block, x, BLOCK_Y-1, omega);
+			d2q9_collide(*block, x, 0,         omega);
+			d2q9_collide(*block, x, BLOCK_Y-1, omega);
 		}
 
 		/* collide boundaries: left, right */
 		for(int y = 1; y < BLOCK_Y-1; y++) {
-			collide_and_swap(*block, 0,         y, omega);
-			collide_and_swap(*block, BLOCK_X-1, y, omega);
+			d2q9_collide(*block, 0,         y, omega);
+			d2q9_collide(*block, BLOCK_X-1, y, omega);
 		}
 
-		/* XXX synchronize */
+		/* synchronize */
+		e_barrier(barriers, tgt_bars);
 
 		/* collide and stream the bulk */
-		collide_and_stream_bulk(*block, omega);
+		d2q9_collide_stream_bulk(*block, omega);
 
 		/* stream the boundaries: left, right */
 		for(int x = 0; x < BLOCK_X; x++) {
-			stream_node(*block, x, 0        );
-			stream_node(*block, x, BLOCK_Y-1);
+			d2q9_stream(*block, x, 0        );
+			d2q9_stream(*block, x, BLOCK_Y-1);
 		}
 
 		/* stream the boundaries: left, right */
 		for(int y = 1; y < BLOCK_Y-1; y++) {
-			stream_node(*block, 0,         y);
-			stream_node(*block, BLOCK_X-1, y);
+			d2q9_stream(*block, 0,         y);
+			d2q9_stream(*block, BLOCK_X-1, y);
 		}
-
-		/* XXX synchronize */
 #endif
 
-
-		/* copy grid to shm */
-		memcpy(&shm.lattice[0], block, sizeof(d2q9_block_t));
+		/* copy grid to shm and synchronize */
+		memcpy(&shm.lattice[row][col], block, sizeof(d2q9_block_t));
+		e_barrier(barriers, tgt_bars);
 
 		/* flag host */
-		shm.states[0]++;
+		if(core == 0 && !(i%100)) {
+			shm.states[row][col]++;
+			delay(1);
+		}
 	}
 
+	shm.states[row][col] = -1;
 	while(1);
 }
 
diff --git a/lb/hsrc/main.c b/lb/hsrc/main.c
index 971425a..1e90eac 100644
--- a/lb/hsrc/main.c
+++ b/lb/hsrc/main.c
@@ -1,7 +1,6 @@
 /* Host Application */
 
 #include <stdio.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
@@ -14,16 +13,16 @@
 #define SHM_OFFSET 0x01000000
 
 static states_t laststates, states;	/* old state value */
-static shm_t    shm = {{ 0 }};		/* local shm copy */
+static shm_t    shm = {{{ 0 }}};	/* local shm copy */
 
-void write_populations(FILE *file, int core, int iter)
+void write_populations(FILE *file, int core_x, int core_y, int iter)
 {
-	for(int x = 0; x < BLOCK_X; x++) {
-		for(int y = 0; y < BLOCK_Y; y++) {
+	for(int y = 0; y < BLOCK_Y; y++) {
+		for(int x = 0; x < BLOCK_X; x++) {
 			fprintf(file, "%03d: [%02d,%02d]: ", iter, x, y);
 			for(int q = 0; q < 9; q++) {
 				fprintf(file, "%.5f\t",
-					shm.lattice[core][x][y][q]);
+					shm.lattice[core_y][core_x][y][x][q]);
 			}
 			fprintf(file, "\n");
 		}
@@ -37,52 +36,49 @@ void write_image(int iter)
 {
 	FILE *file; char name[32];
 
-	FLOAT rhos[4][4][BLOCK_X][BLOCK_Y];
-	FLOAT min = 1, max = 0;
-	uint8_t gray;
-
 	snprintf(name, 32, "./tmp/i%06d.ppm", iter);
 	file = fopen(name, "wb");
 	if(!file) exit(-1);
 
-	fprintf(file, "P5\n%d %d\n%d\n", BLOCK_X, BLOCK_Y, 255);
+	fprintf(file, "P5\n%d %d\n%d\n",
+		CORES_X*BLOCK_X, CORES_Y*BLOCK_Y, 255);
 
 	/* calculate all densities and remember min/max */
-	int cx = 0, cy = 0;
-//	for(int cy = 0; cy < 1; cy++) {
-//		for(int cx = 0; cx < 1; cx++) {
-			for(int x = 0; x < BLOCK_X; x++) {
-				for(int y = 0; y < BLOCK_Y; y++) {
-					rhos[cy][cx][x][y] = 0;
-					for(int q = 0; q < 9; q++) {
-						rhos[cy][cx][x][y] +=
-						shm.lattice[cy*4+cx][x][y][q];
-					}
-
-					if(rhos[cy][cx][x][y] < min)
-						min = rhos[cy][cx][x][y];
-					if(rhos[cy][cx][x][y] > max)
-						max = rhos[cy][cx][x][y];
+	FLOAT min = 1.0, max = 0;
+	FLOAT rhos[CORES_Y][BLOCK_Y][CORES_X][BLOCK_X];
+	for(int cy = 0; cy < CORES_Y; cy++) {
+		for(int y = 0; y < BLOCK_Y; y++) {
+			for(int cx = 0; cx < CORES_X; cx++) {
+				for(int x = 0; x < BLOCK_X; x++) {
+					FLOAT rho = 0;
+					for(int q = 0; q < 9; q++)
+						rho += shm.lattice[cy][cx][y][x][q];
+					rhos[cy][y][cx][x] = rho;
+
+					if(rho < min) min = rho;
+					if(rho > max) max = rho;
 				}
 			}
-//		}
-//	}
+		}
+	}
 
 	/* now scale values and write to image file */
-//	for(int cy = 0; cy < 4; cy++) {
-//		for(int cx = 0; cx < 4; cx++) {
-			for(int x = 0; x < BLOCK_X; x++) {
-				for(int y = 0; y < BLOCK_Y; y++) {
-					gray = (int)(255.*(rhos[cy][cx][x][y]-min) / (max-min));
+	for(int cy = 0; cy < CORES_Y; cy++) {
+		for(int y = 0; y < BLOCK_Y; y++) {
+			for(int cx = 0; cx < CORES_X; cx++) {
+				for(int x = 0; x < BLOCK_X; x++) {
+					unsigned char gray;
+					gray = (255. * (rhos[cy][y][cx][x]-min)
+						/ (max-min));
 					fwrite(&gray, 1, 1, file);
 				}
 			}
-//		}
-//	}
+		}
+	}
 
 	fclose(file);
 	if(chown(name, atoi(getenv("SUDO_UID")), atoi(getenv("SUDO_GID")))) {
-		perror("chown");
+		FAIL("Can't chown image!\n");
 	}
 
 	return;
@@ -94,14 +90,16 @@ int main()
 	FILE *datfile; char *datname = "populations.dat";
 	int dummy, old0 = 0;
 
+	/* remove old results */
+	dummy = system("rm -f ./tmp/i*.ppm ./tmp/anim.gif populations.dat");
+	(void)dummy;
+
 	e_epiphany_t dev;
 	e_mem_t      mem;
 
 	e_set_host_verbosity(H_D0);
 	e_set_loader_verbosity(L_D0);
 
-	dummy = system("rm ./tmp/i*.ppm ./tmp/anim.gif"); (void)dummy;
-
 	/* overwrite results file */
 	datfile = fopen(datname, "w");
 	if(!datfile)
@@ -111,7 +109,7 @@ int main()
 	if(e_init(NULL) != E_OK)
 		FAIL("Can't init!\n");
 	e_reset_system();
-	if(e_open(&dev, 0, 0, 4, 4) != E_OK)
+	if(e_open(&dev, 0, 0, CORES_X, CORES_Y) != E_OK)
 		FAIL("Can't open!\n");
 	if(e_alloc(&mem, SHM_OFFSET, sizeof(shm_t)) != E_OK)
 		FAIL("Can't alloc!\n");
@@ -119,12 +117,19 @@ int main()
 		FAIL("Can't clear shm!\n");
 
 	/* load programs */
-	if(e_load(filename, &dev, 0, 0, E_TRUE) != E_OK)
-		FAIL("Can't load!\n");
+	printf("Starting cores:\n");
+	for(int y = 0; y < CORES_Y; y++) {
+		for(int x = 0; x < CORES_X; x++) {
+			printf("(%02d,%02d) ", x, y);
+			if(e_load(filename, &dev, x, y, E_TRUE) != E_OK)
+				FAIL("Can't load!\n");
+		}
+		printf("\n");
+	}
 
 	/* ================================================================ */
+	printf("Polling shared memory.\n");
 	while(1) {
-		printf("Polling shared memory.\n");
 
 		while(1) {
 			/* read states */
@@ -146,23 +151,27 @@ int main()
 		memcpy(&laststates, &shm, sizeof(states_t));
 
 		/* print states */
-		printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-			states[0],  states[1],  states[2],  states[3]);
-		printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-			states[4],  states[5],  states[6],  states[7]);
-		printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-			states[8],  states[9],  states[10], states[11]);
-		printf("\t0x%08x 0x%08x 0x%08x 0x%08x\n",
-			states[12], states[13], states[14], states[15]);
-
-		/* write populations */
-		if(states[0] != old0) {
-			write_populations(datfile, 0, states[0]);
-			write_image(states[0]);
-			old0 = states[0];
+#if 0
+		for(int y = 0; y < CORES_Y; y++) {
+			printf("\t");
+			for(int x = 0; x < CORES_X; x++) {
+				printf("0x%08x ", states[y][x]);
+			}
+			printf("\n");
+		}
+#else
+		printf("0x%08x\r", states[0][0]);
+		fflush(stdout);
+#endif
+
+		/* write data */
+		if(states[0][0] != old0) {
+			//write_populations(datfile, 0, states[0]);
+			write_image(states[0][0]);
+			old0 = states[0][0];
 		}
 
-		if(states[0] >= 21) break;
+		if(states[0][0] == -1) break;
 	}
 	/* ================================================================ */
 
diff --git a/lb/shared.h b/lb/shared.h
index 7000418..e752a35 100644
--- a/lb/shared.h
+++ b/lb/shared.h
@@ -4,35 +4,36 @@
 
 #include <stdint.h>
 
-/* PACKED is defined for e-gcc, but not for gcc */
+/* preprocessor magic */
+#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
+#define UNUSED __attribute__((unused))
 #ifndef PACKED
 #define PACKED __attribute__((packed))
 #endif /* PACKED */
 
-/* produce compile-time errors if condition is true */
-#define BUILD_BUG(c) do { ((void)sizeof(char[1 - 2*!!(c)])); } while(0);
-
 /* number of cores */
-#define NUM_CORES 16
+#define CORES_X 4
+#define CORES_Y 4
+#define NUM_CORES (CORES_X * CORES_Y)
 
 /* size of per-core subgrid */
-#define BLOCK_X 15
-#define BLOCK_Y 15
+#define BLOCK_X 26
+#define BLOCK_Y 26
 
 /* floating point type */
 typedef float FLOAT;
 
 /* state type */
-typedef uint32_t states_t[NUM_CORES];
+typedef uint32_t states_t[CORES_Y][CORES_X];
 
 /* node and block type (D2Q9) */
 typedef FLOAT       d2q9_node_t[9];
-typedef d2q9_node_t d2q9_block_t[BLOCK_X][BLOCK_Y];
+typedef d2q9_node_t d2q9_block_t[BLOCK_Y][BLOCK_X];
 
 /* shared memory structure */
 typedef struct {
 	states_t     states;
-	d2q9_block_t lattice[NUM_CORES];
+	d2q9_block_t lattice[CORES_Y][CORES_X];
 } PACKED shm_t;
 
 #endif /* _SHARED_H_ */