X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=sor.frag;h=ef431d3346d96be4e0270ddd6ad4c9ebd1664a63;hb=40043846b1a598b9819a03bcf0c6bda4a43aa7f9;hp=e1f86bbbfea8c3f6f019857369fe220eb69e5e4c;hpb=6911c97816e4dde727ed3aa0fa0631c4047d2bd6;p=nageru diff --git a/sor.frag b/sor.frag index e1f86bb..ef431d3 100644 --- a/sor.frag +++ b/sor.frag @@ -45,8 +45,9 @@ void main() // just immediately throws away half of the warp, but it helps convergence // a _lot_ (rough testing indicates that five iterations of SOR is as good // as ~50 iterations of Jacobi). We could probably do better by reorganizing - // the data into two-values-per-pixel, so-called “twinning buffering”, - // but it makes for rather annoying code in the rest of the pipeline. + // the data into two-values-per-pixel, so-called “twinned buffering”; + // seemingly, it helps Haswell by ~15% on the SOR code, but GTX 950 not at all + // (at least not on 720p). Presumably the latter is already bandwidth bound. int color = int(round(element_sum_idx)) & 1; if (color != phase) discard;