As in the comment, the precision measured against stdlib (on PC) is ±0.003108. I only kept the 0.0001 multiplier because that’s how it was before, and is already plenty of precision. But that’s a nice idea using ldexp. I’ll do some speed testing and see which way would be faster.
The array has 257 entries because otherwise the wave has to turn around before it reaches the exact 1.0 at pi/2. The previous entry rounds up to 1.0 anyway, but if you generated the array with higher precision you would see that it is actually slightly below 1. You can change the generation code to sin(i * M_PI/2 / 255) * 10000 + 0.5
to fit in 256 entries and still retain ±0.003109 precision, but the lookup code becomes:
unsigned short sine_array[256] = {0,62,123,185,246,308,370,431,493,554,616,677,739,800,861,923,984,1045,1107,1168,1229,1290,1351,1412,1473,1534,1595,1656,1716,1777,1837,1898,1958,2019,2079,2139,2199,2260,2319,2379,2439,2499,2558,2618,2677,2737,2796,2855,2914,2973,3032,3090,3149,3207,3265,3324,3382,3439,3497,3555,3612,3670,3727,3784,3841,3898,3955,4011,4067,4124,4180,4235,4291,4347,4402,4457,4512,4567,4622,4677,4731,4785,4839,4893,4947,5000,5053,5106,5159,5212,5264,5317,5369,5421,5472,5524,5575,5626,5677,5727,5778,5828,5878,5928,5977,6026,6075,6124,6173,6221,6269,6317,6365,6412,6459,6506,6553,6599,6645,6691,6737,6782,6827,6872,6917,6961,7005,7049,7093,7136,7179,7222,7264,7307,7348,7390,7431,7473,7513,7554,7594,7634,7674,7713,7752,7791,7829,7867,7905,7943,7980,8017,8054,8090,8126,8162,8197,8233,8267,8302,8336,8370,8403,8437,8470,8502,8534,8566,8598,8629,8660,8691,8721,8751,8781,8810,8839,8868,8896,8924,8952,8979,9006,9032,9059,9085,9110,9135,9160,9185,9209,9233,9256,9280,9302,9325,9347,9369,9390,9411,9432,9452,9472,9491,9511,9529,9548,9566,9584,9601,9618,9635,9651,9667,9683,9698,9713,9727,9741,9755,9768,9781,9794,9806,9818,9830,9841,9852,9862,9872,9882,9891,9900,9908,9916,9924,9932,9939,9945,9951,9957,9963,9968,9973,9977,9981,9985,9988,9991,9993,9995,9997,9998,9999,10000,10000};
void _sincos(float a, float *sin, float *cos) {
unsigned int i = ((unsigned int)(a * (255*8 / (M_PI*2)) + 1) >> 1) & 0x3ff;
if (i < 255) {
*sin = 0.0001f*sine_array[i];
*cos = 0.0001f*sine_array[255 - i];
}
else if(i < 511) {
*sin = 0.0001f*sine_array[510 - i];
*cos = -0.0001f*sine_array[-255 + i];
}
else if(i < 766) {
*sin = -0.0001f*sine_array[-510 + i];
*cos = -0.0001f*sine_array[765 - i];
}
else {
*sin = -0.0001f*sine_array[1020 - i];
*cos = 0.0001f*sine_array[-765 + i];
}
}
On ARM those constants can’t fit in an 8-bit shifted immediate value so it will be slower and take more than 2 bytes of additional code space to store them. I’m less knowledgeable about the details of compiled code for AVR, so I’m not sure if there would be any difference on it.
You could certainly save some space by skipping the combined _sincos. And the simplified _cos will save a bit compared to the old one. The only way around using a 256 (or 257) entry table would be to interpolate between entries. Then 64 would probably be ok. I’ll give it a try.
It certainly won’t overflow Lepton, since the array is actually 286 bytes smaller with the proper data type now 
EDIT: Yep, that works
Actually better precision, ±0.000243. And the constants all easily fit into 8 bits now, so you could use a 64 entry table if that 65th one really bothers you…
uint16_t sine_array[65] = {0,245,491,736,980,1224,1467,1710,1951,2191,2430,2667,2903,3137,3369,3599,3827,4052,4276,4496,4714,4929,5141,5350,5556,5758,5957,6152,6344,6532,6716,6895,7071,7242,7410,7572,7730,7883,8032,8176,8315,8449,8577,8701,8819,8932,9040,9142,9239,9330,9415,9495,9569,9638,9700,9757,9808,9853,9892,9925,9952,9973,9988,9997,10000};
uint16_t lerp(uint16_t a, uint16_t b, uint16_t t) { return a + (((b - a) * t) >> 8); }
float _sin(float a) {
unsigned int i = (unsigned int)(a * 65536 / (M_PI*2));
unsigned int frac = i & 0xff;
i = (i >> 8) & 0xff;
if (i < 64)
return 0.0001f*lerp(sine_array[i], sine_array[i + 1], frac);
else if(i < 128)
return 0.0001f*lerp(sine_array[128 - i], sine_array[127 - i], frac);
else if(i < 192)
return -0.0001f*lerp(sine_array[-128 + i], sine_array[-127 + i], frac);
else
return -0.0001f*lerp(sine_array[256 - i], sine_array[255 - i], frac);
}
EDIT2: Alternative style. And even better precision, ±0.000161 using for(int i = 0; i <= 64; i++) fprintf(file, "%i,", (int)(sin((i * M_PI/2) / 64) * 32768 + 0.5));
and ldexp.
uint16_t sine_array[65] = {0,804,1608,2411,3212,4011,4808,5602,6393,7180,7962,8740,9512,10279,11039,11793,12540,13279,14010,14733,15447,16151,16846,17531,18205,18868,19520,20160,20788,21403,22006,22595,23170,23732,24279,24812,25330,25833,26320,26791,27246,27684,28106,28511,28899,29269,29622,29957,30274,30572,30853,31114,31357,31581,31786,31972,32138,32286,32413,32522,32610,32679,32729,32758,32768};
float _sin(float angle) {
int i = (int)(angle * 65536 / (M_PI*2));
int a, b, frac = i & 0xff;
i = (i >> 8) & 0xff;
if (i < 64)
a = sine_array[i], b = sine_array[i + 1];
else if(i < 128)
a = sine_array[128 - i], b = sine_array[127 - i];
else if(i < 192)
a = -sine_array[-128 + i], b = -sine_array[-127 + i];
else
a = -sine_array[256 - i], b = -sine_array[255 - i];
return ldexp((a + (((b - a) * frac) >> 8)), -15);
}
EDIT3: lol, even an 8+1 entry table matches the precision of the original, ±0.00489. I’ll go with 16+1, precision ±0.00129. Maybe now we’ll have space for _sincos.
uint16_t sine_array[17] = {0,3212,6393,9512,12540,15447,18205,20788,23170,25330,27246,28899,30274,31357,32138,32610,32768};
float _sin(float angle) {
int i = (int)(angle * 4*16*256 / (M_PI*2)); // 4 quadrants, 16 points, 256 fraction
int a, b, frac = i & 0xff;
i = (i >> 8) & 0x3f;
if (i < 16)
a = sine_array[i], b = sine_array[i + 1];
else if(i < 32)
a = sine_array[32 - i], b = sine_array[31 - i];
else if(i < 48)
a = -sine_array[-32 + i], b = -sine_array[-31 + i];
else
a = -sine_array[64 - i], b = -sine_array[63 - i];
return ldexp((a + (((b - a) * frac) >> 8)), -15);
}
float _sincos(float angle, float *s, float *c) {
int i = (int)(angle * 4*16*256 / (M_PI*2));
int sa, sb, ca, cb, frac = i & 0xff;
i = (i >> 8) & 0x3f;
if (i < 16) {
sa = sine_array[i], sb = sine_array[i + 1];
ca = sine_array[16 - i], cb = sine_array[15 - i];
}
else if(i < 32) {
sa = sine_array[32 - i], sb = sine_array[31 - i];
ca = -sine_array[-16 + i], cb = -sine_array[-15 + i];
}
else if(i < 48) {
sa = -sine_array[-32 + i], sb = -sine_array[-31 + i];
ca = -sine_array[48 - i], cb = -sine_array[47 - i];
}
else {
sa = -sine_array[64 - i], sb = -sine_array[63 - i];
ca = sine_array[-48 + i], cb = sine_array[-47 + i];
}
*s = ldexp((sa + (((sb - sa) * frac) >> 8)), -15);
*c = ldexp((ca + (((cb - ca) * frac) >> 8)), -15);
}