つれづれ～努力と根性？～: いろいろ新しくしたら速くなってきた

MPEG2DecPlusとUnsharpHQを新しくして、dfttestのdither_Cをもうちょっとチューニングしたら

MPEG2DecPlus_MPEG2Source(vsource,idct=4)
TDout =ScriptName+"_TDmetrics2.txt"
TFMout =ScriptName+"_TFMout2.txt"
TFM(mode=1,order=-1,PP=1,slow=2,input=TFMout,batch=true)
TDecimate(mode=1,tfmIn=TFMout,input=TDout,batch=true)
FluxSmoothT(temporal_threshold=3)
Spline144Resize(resizex,resizey)
block=32
dfttest(ftype=0,f0beta=1,sigma=4,sbsize=block,sosize=block/2,smode=1,swin=0,tbsize=1,tmode=0,twin=7,dither=1,opt=0)
unsharpHQ_v05_x86_unsharpHQ(SHARPSTR=2.4,THRESHOLD=40,SMOOTH=0) #unsharpHQ(STR=1.0)

というスクリプト(リサイズは1440x1080→1280x720)で28fps程度出るようになりました。(i7-4790@4.5GHz)
最適化対象はAVXで、AVX2用(FMA3を使う)にするともうちょっと(0.5%位？)速くなります。

何にせよ、ようやく実エンコード環境(x264と同時に走らせて)で24fpsを超えることができました(24.5fps)。
事前にTIVTCのhintsを求めてるため、本当のリアルタイムエンコードはできませんが(というかもっと時間がかかりますが)、

・・・あれ？
svml_dispmd.dllが見つかりませんとか言われる・・

確認していったら新しくしたNNEDI3_v0_9_4_47のnnedi3.dllがリンクしてるっぽい。
ってかintelってintelコンパイラか・・・
というわけでNNEDI3のノーマル版を入れたら直りました。
(ちゃんとreadmeに書いてあった・・・
Get Intel Parallel Studio 2017 redistribuables to use Intel versions.
https://software.intel.com/en-us/articles/intelr-composer-redistributable-libraries-by-version)

ついでにFluxSmoothを速くできるんじゃね？と思って探したらSSE化している人がいた！
・・・と思ったけど、x64専用になってる・・・
xmmも8以降がまんべんなく使われていて単純に32bit化するのは面倒そう・・・
いや、そもそも全部64bit化すればいいのか？とか悩みちゅう。

作ったdfttestはバグがありそうですし例によってバイナリ配布とかは面倒なので適当にdfttest.cのdiffから作ってください・・・↓
324a325,326
>         mulps xmm0,[edi+ebx*4]
>         mulps xmm1,[edi+ebx*4+16]
327d328
<         mulps xmm0, [edi+ebx*4]
329,330d329
<         //vfmadd132ps xmm0,xmm2, [edi+ebx*4]    FMA3有効時(≒AVX2)
<         mulps xmm1,[edi+ebx*4+16]
332d330
<         //vfmadd132ps xmm1, xmm3, [edi+ebx*4+16]
596,598c594,595
<         memset(dn, 0, width*sizeof(float));
<
<         for (int x = 0; x < src_width; ++x)
---
>         memset(dn,0,width*sizeof(float));
>         for (int x=0; x<src_width; ++x)
602,604c599,601
<                 (int)(p[x]+dc[x]+0.5f+mtr.randf()*scale-off);
<             dst[x] = min(max(v, 0), 255);
<             const float qerror = p[x] - dst[x];
---
>                 (int)(p[x]+mtr.randf()*scale-off+dc[x]+0.5f);
>             dst[x] = min(max(v,0),255);
>             const float qerror = p[x]-dst[x];
606c603
<                 dn[x - 1] += qerror*0.1875f;
---
>                 dn[x-1] += qerror*0.1875f;
608c605
<             if (x != src_width - 1)
---
>             if (x != src_width-1)
610,611c607,608
<                 dc[x + 1] += qerror*0.4375f;
<                 dn[x + 1] += qerror*0.0625f;
---
>                 dc[x+1] += qerror*0.4375f;
>                 dn[x+1] += qerror*0.0625f;
623,753d619
< void dither1_C(const float *p, unsigned char *dst, const int src_height,
<     const int src_width, const int dst_pitch, const int width, const int mode)
< {
<     float *dither = (float*)malloc(2 * width * sizeof(float));
<     float *dc = dither;
<     float *dn = dither + width;    //__declspec(align(16))
<     const float scale = (mode - 1) + 0.5f;
<     const float off = scale*0.5f;
<     memset(dc, 0, width * sizeof(float));
<     for (int y = 0; y<src_height; ++y)
<     {
<         memset(dn, 0, width * sizeof(float));
<         /*
<         //orig code
<         for (int x = 0; x < src_width; ++x)
<         {
<             const int v =(int)(p[x]+dc[x]+0.5f);
<             dst[x] = min(max(v, 0), 255);
<             const float qerror = p[x] - dst[x];
<             if (x != 0)
<             dn[x - 1] += qerror*0.1875f;
<             dn[x] += qerror*0.3125f;
<             if (x != src_width - 1)
<             {
<                 dc[x + 1] += qerror*0.4375f;
<                 dn[x + 1] += qerror*0.0625f;
<             }
<         }
<         */
<         //opt code
<         {
<             int x;
<             short vtmp[4];
<
<             float qerror[4];
<             float vtmp2[4];
<             const float floydst[4] = { 7.0 / 16, 3.0 / 16, 5.0 / 16, 1.0 / 16 };
<
<
<             {
<                 x = 0;
<                 dst[x] = min(max((short)(p[x] + dc[x] + 0.5f), 0), 255);
<                 qerror[0] = p[x] - dst[x];
<                 dc[x + 1] += qerror[0] * floydst[0];
<                 dn[x]     += qerror[0] * floydst[2];
<                 dn[x + 1] += qerror[0] * floydst[3];
<             }
<
<             for (x = 1; x < src_width - 4; x += 4)
<             {
<                 //float vtmp2[] = { p[x] + dc[x] + 0.5f ,p[x + 1] + dc[x+1] + 0.5f ,p[x + 2] + dc[x+2] + 0.5f ,p[x + 3] + dc[x+3] + 0.5f };
<                 for (int ii = 0; ii < 4; ii++) {vtmp2[ii] = p[x+ii] + dc[x+ii] + 0.5f;}
<
<                 vtmp[0] = min(max((short)vtmp2[0], 0), 255);    //new pixel
<                 qerror[0] = p[x] - vtmp[0];                //quant error
<
<                 vtmp2[1] += qerror[0] * floydst[0];
<                 vtmp[1] = min(max((short)vtmp2[1], 0), 255);
<                 qerror[1] = p[x + 1] - vtmp[1];
<
<                 vtmp2[2] += qerror[1] * floydst[0];
<                 vtmp[2] = min(max((short)vtmp2[2], 0), 255);
<                 qerror[2] = p[x + 2] - vtmp[2];
<
<                 vtmp2[3] += qerror[2] * floydst[0];
<                 vtmp[3] = min(max((short)vtmp2[3], 0), 255);
<                 qerror[3] = p[x + 3] - vtmp[3];
<
<                 dc[x + 1] += qerror[0] * floydst[0];        //current
<                 dc[x + 2] += qerror[1] * floydst[0];
<                 dc[x + 3] += qerror[2] * floydst[0];
<                 dc[x + 4] += qerror[3] * floydst[0];        //next loop first
<
<                 dst[x] = (unsigned char)vtmp[0];                    //new pixel
<                 dst[x + 1] = (unsigned char)vtmp[1];
<                 dst[x + 2] = (unsigned char)vtmp[2];
<                 dst[x + 3] = (unsigned char)vtmp[3];
<
<                 //                for (int ii = 0; ii < 4; ii++) {
<                 //                    dn[x + ii - 1] += qerror[ii] * floydst[1];
<                 //                    dn[x + ii]     += qerror[ii] * floydst[2];
<                 //                    dn[x + ii + 1] += qerror[ii] * floydst[3];
<                 //                }
<
<                 dn[x - 1] += qerror[0] * floydst[1];
<
<                 dn[x] += qerror[0] * floydst[2];
<                 dn[x] += qerror[1] * floydst[1];
<
<                 dn[x + 1] += qerror[0] * floydst[3];
<                 dn[x + 1] += qerror[1] * floydst[2];
<                 dn[x + 1] += qerror[2] * floydst[1];
<
<                 dn[x + 2] += qerror[1] * floydst[3];
<                 dn[x + 2] += qerror[2] * floydst[2];
<                 dn[x + 2] += qerror[3] * floydst[1];
<
<                 dn[x + 3] += qerror[2] * floydst[3];
<                 dn[x + 3] += qerror[3] * floydst[2];
<
<                 dn[x + 4] += qerror[3] * floydst[3];
<
<
<
<             }
<
<             for (; x < src_width; ++x)
<             {
<                 dst[x] = min(max((short)(p[x] + dc[x] + 0.5f), 0), 255);
<                 qerror[0] = p[x] - dst[x];
<                 dn[x - 1] += qerror[0] * 0.1875f;
<                 dn[x] += qerror[0] * floydst[2];
<                 if (x != src_width - 1)
<                 {
<                     dc[x + 1] += qerror[0] * floydst[0];
<                     dn[x + 1] += qerror[0] * floydst[3];
<                 }
<             }
<         }
<         //opt code end
<
<
<         p += width;
<         dst += dst_pitch;
<         float *tn = dn;
<         dn = dc;
<         dc = tn;
<     }
<     free(dither);
< }
<
1022,1025c888,889
<         if (dither == 1)
<             dither1_C(ebp,dstp,src_height,src_width,dst_pitch,width,dither);
<         else if (dither)
<             dither_C(ebp, dstp, src_height, src_width, dst_pitch, width, dither);
---
>         if (dither)
>             dither_C(ebp,dstp,src_height,src_width,dst_pitch,width,dither);
1089d952
<     /*
1097,1104d959
<     */
<     // for compiler vectorize ただしそのままSSEに持っていくとorigコードと結果がかなり変わるので注意
<     for (int h = 0; h<ccnt; h++)
<     {
<         dftc2[h] = gf*dftgc[h];
<         dftc[h] -= dftc2[h];
<     }
<
1122,1130c977,985
<         movaps xmm0,[esi+eax]        //xmm0=[dftgc+h]
<         movaps xmm1,[edx+eax]        //xmm1=[dftc +h]
<         mulps xmm0,xmm7                //xmm0=[dftgc+h]*gf
<         subps xmm1,xmm0                //xmm1=[dftc +h] - [dftgc+h]*gf
<         movaps [edi+eax],xmm0        //[dftc2 +h] = [dftgc+h]*gf
<         movaps [edx+eax],xmm1        //[dftc +h] = [dftc +h] - [dftgc+h]*gf
<         add eax, 16
<         sub ecx,4
<         jg four_loop
---
>         movaps xmm0,[esi+eax*4]
>         mulps xmm0,xmm7
>         movaps xmm1,[edx+eax*4]
>         subps xmm1,xmm0
>         movaps [edx+eax*4],xmm1
>         movaps [edi+eax*4],xmm0
>         add eax,4
>         cmp eax,ecx
>         jl four_loop
1136d990
<     /*
1142,1146d995
<     */
<     for (int h = 0; h<ccnt; h ++)
<     {
<         dftc[h] += dftc2[h];
<     }
1178c1027
<         const float psd = dftc[h+0]*dftc[h+0]+dftc[h+1]*dftc[h+1];        //psd=dftc[h]^2+dftc[h+1]^2
---
>         const float psd = dftc[h+0]*dftc[h+0]+dftc[h+1]*dftc[h+1];
1197,1214c1046,1063
<         movaps xmm2,[edi+eax]        //xmm2=dftc[h+ 3,2,1,0]←レジスタの内容配列だと逆向きなので注意
<         movaps xmm1,xmm2            //xmm1=dftc[h+ 3,2,1,0]
<         mulps xmm2,xmm2                //xmm2=dftc[h+ 3,2,1,0].^2
<         movaps xmm3,xmm2            //xmm3=dftc[h+ 3,2,1,0].^2
<         shufps xmm3,xmm3,177        //xmm3=dftc[h+ 2,3,0,1].^2
<         addps xmm3,xmm2                //xmm3=dftc[h]^2+dftc[h+1]^2 で32x4bit psd
<         movaps xmm2,xmm3            //xmm2=dftc[h]^2+dftc[h+1]^2
<         subps xmm3,[edx+eax]        //xmm3=dftc[h]^2+dftc[h+1]^2 - simgas[h]
<         addps xmm2,xmm7                //xmm2=dftc[h]^2+dftc[h]^2+1e-15
<         //divps xmm3,xmm2            //divにしてもメモリアクセスの遅さに比べれば十分早い
<         rcpps xmm2,xmm2                //xmm2=1/xmm2 逆数命令は精度が悪いので注意
<         mulps xmm3,xmm2                //xmm3=(psd-sigmas[h])/(psd+1e-15f)
<         maxps xmm3,xmm5                //max(xmm3,0)
<         mulps xmm1,xmm3                //xmm1*=xmm3
<         movaps [edi+eax],xmm1
<         add eax,16
<         sub ecx,4
<         jg four_loop
---
>         movaps xmm1,[edi+eax*4]
>         movaps xmm2,xmm1
>         mulps xmm2,xmm2
>         movaps xmm3,xmm2
>         shufps xmm3,xmm3,177
>         addps xmm3,xmm2
>         movaps xmm2,xmm3
>         subps xmm3,[edx+eax*4]
>         addps xmm2,xmm7
>         //divps xmm3,xmm2
>         rcpps xmm2,xmm2
>         mulps xmm3,xmm2
>         maxps xmm3,xmm5
>         mulps xmm1,xmm3
>         movaps [edi+eax*4],xmm1
>         add eax,4
>         cmp eax,ecx
>         jl four_loop

つれづれ～努力と根性？～

2017/11/11

いろいろ新しくしたら速くなってきた

0 件のコメント:

コメントを投稿