From kde-kimageshop Wed Oct 31 11:29:34 2012 From: Jaroslaw Staniek Date: Wed, 31 Oct 2012 11:29:34 +0000 To: kde-kimageshop Subject: [calligra/kexi-altertable-staniek] krita/plugins/paintops/libbrush: Optimized Auto Brush mask fillin Message-Id: <20121031112934.C3507A60E1 () git ! kde ! org> X-MARC-Message: https://marc.info/?l=kde-kimageshop&m=135168302515156 Git commit 8127e2a70646f67838dc788ae7f5c91a69aafd25 by Jaroslaw Staniek, on behalf of Dmitry Kazakov. Committed on 10/10/2012 at 12:30. Pushed by staniek into branch 'kexi-altertable-staniek'. Optimized Auto Brush mask filling code This patch uses internal cpu parallelism and makes the code execute much faster in the 'KisStrokeBenchmark pixelbrush300pxRL' benchmark. Actual results in the benchmark: Sandy Bridge (Core i7-2600): +25% Merom (Core 2 Duo T7250): +10% According to VTune the painting should have become up to 10% faster (on Sandy Bridge), because now this part of code consumes almost no time. This optimization will work most on the highest precision levels, that is when a dab cannot be cached. CCMAIL:kimageshop@kde.org M +90 -13 krita/plugins/paintops/libbrush/kis_auto_brush.cpp http://commits.kde.org/calligra/8127e2a70646f67838dc788ae7f5c91a69aafd25 diff --git a/krita/plugins/paintops/libbrush/kis_auto_brush.cpp b/krita/plugins/paintops/libbrush/kis_auto_brush.cpp index 4449d9f..34a0b76 100644 --- a/krita/plugins/paintops/libbrush/kis_auto_brush.cpp +++ b/krita/plugins/paintops/libbrush/kis_auto_brush.cpp @@ -237,7 +237,86 @@ KisAutoBrush::~KisAutoBrush() delete d; } +inline void fillPixelOptimized_4bytes(quint8 *color, quint8 *buf, int size) +{ + /** + * This version of filling uses low granularity of data transfers + * (32-bit chunks) and internal processor's parallelism. It reaches + * 25% better performance in KisStrokeBenchmark in comparison to + * per-pixel memcpy version (tested on Sandy Bridge). 
+ */ + + int block1 = size / 8; + int block2 = size % 8; + + quint32 *src = reinterpret_cast<quint32*>(color); + quint32 *dst = reinterpret_cast<quint32*>(buf); + + // check whether all buffers are 4 bytes aligned + // (uncomment if experience some problems) + // Q_ASSERT(((qint64)src & 3) == 0); + // Q_ASSERT(((qint64)dst & 3) == 0); + + for (int i = 0; i < block1; i++) { + *dst = *src; + *(dst+1) = *src; + *(dst+2) = *src; + *(dst+3) = *src; + *(dst+4) = *src; + *(dst+5) = *src; + *(dst+6) = *src; + *(dst+7) = *src; + + dst += 8; + } + + for (int i = 0; i < block2; i++) { + *dst = *src; + dst++; + } +} +inline void fillPixelOptimized_general(quint8 *color, quint8 *buf, int size, int pixelSize) +{ + /** + * This version uses internal processor's parallelism and gives + * 20% better performance in KisStrokeBenchmark in comparison to + * per-pixel memcpy version (tested on Sandy Bridge (+20%) and + * on Merom (+10%)). + */ + + int block1 = size / 8; + int block2 = size % 8; + + for (int i = 0; i < block1; i++) { + quint8 *d1 = buf; + quint8 *d2 = buf + pixelSize; + quint8 *d3 = buf + 2 * pixelSize; + quint8 *d4 = buf + 3 * pixelSize; + quint8 *d5 = buf + 4 * pixelSize; + quint8 *d6 = buf + 5 * pixelSize; + quint8 *d7 = buf + 6 * pixelSize; + quint8 *d8 = buf + 7 * pixelSize; + + for (int j = 0; j < pixelSize; j++) { + *(d1 + j) = color[j]; + *(d2 + j) = color[j]; + *(d3 + j) = color[j]; + *(d4 + j) = color[j]; + *(d5 + j) = color[j]; + *(d6 + j) = color[j]; + *(d7 + j) = color[j]; + *(d8 + j) = color[j]; + } + + buf += 8 * pixelSize; + } + + for (int i = 0; i < block2; i++) { + memcpy(buf, color, pixelSize); + buf += pixelSize; + } +} void KisAutoBrush::generateMaskAndApplyMaskOrCreateDab(KisFixedPaintDeviceSP dst, KisBrush::ColoringInformation* coloringInformation, @@ -299,23 +378,21 @@ void KisAutoBrush::generateMaskAndApplyMaskOrCreateDab(KisFixedPaintDeviceSP dst d->shape->setSoftness( softnessFactor ); - for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x++) { 
- - if (coloringInformation) { - if (color) { - memcpy(dabPointer, color, pixelSize); - } else { + if (coloringInformation) { + if (color && pixelSize == 4) { + fillPixelOptimized_4bytes(color, dabPointer, dstWidth * dstHeight); + } else if (color) { + fillPixelOptimized_general(color, dabPointer, dstWidth * dstHeight, pixelSize); + } else { + for (int y = 0; y < dstHeight; y++) { + for (int x = 0; x < dstWidth; x++) { memcpy(dabPointer, coloringInformation->color(), pixelSize); coloringInformation->nextColumn(); } + coloringInformation->nextRow(); } - dabPointer += pixelSize; - }//endfor x - if (!color && coloringInformation) { - coloringInformation->nextRow(); - } - }//endfor y + } + } MaskProcessor s(dst, cs, d->randomness, d->density, centerX, centerY, invScaleX, invScaleY, angle, d->shape); int jobs = d->idealThreadCountCached; _______________________________________________ kimageshop mailing list kimageshop@kde.org https://mail.kde.org/mailman/listinfo/kimageshop