shssoichiro

4x4 kernel before

Oct 26th, 2022
1,954
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # ---------------------------------------------------------------------------
  # satd_kernel_4x4_hbd_avx2  (compiler-generated listing; GAS directives,
  # Intel operand syntax)
  #
  # Reads a 4x4 block of 16-bit values from each of two buffers, subtracts them
  # element-wise, applies add/subtract butterflies first down every column and
  # then across every row of the differences (an unnormalized 4x4
  # Hadamard-style transform), and returns the sum of the absolute values of
  # the 16 resulting coefficients. No scaling is applied in this routine.
  #
  # Registers read on entry:
  #   rdi = base address of block A
  #   rsi = row stride of A in 16-bit elements (row r is at rdi + 2*r*rsi bytes)
  #   rdx = base address of block B
  #   rcx = row stride of B in 16-bit elements (row r is at rdx + 2*r*rcx bytes)
  # Result:
  #   rax = sum of |coefficient| over all 16 coefficients (64-bit accumulation)
  # Saved/restored: rbx, rbp, r12, r13, r14, r15
  # Overwritten:    rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm4 / ymm0-ymm1, flags
  # Stack:          24 bytes of spill space at [rsp], [rsp+8], [rsp+16]
  #
  # Notation used in the comments below:
  #   A[r][c], B[r][c]  samples of the two blocks; d[r][c] = A[r][c] - B[r][c]
  #   Dr                vector {d[r][0], d[r][1], d[r][2], d[r][3]}
  #   For column c of the differences:
  #     s_c = (d0 + d1) + (d2 + d3)      v_c = (d0 + d1) - (d2 + d3)
  #     t_c = (d0 - d1) + (d2 - d3)      u_c = (d0 - d1) - (d2 - d3)
  #   For a row x0..x3 of such values the horizontal stage produces, in lane
  #   order: {x0+x1+x2+x3, (x0-x1)+(x2-x3), (x0+x1)-(x2+x3), (x0-x1)-(x2-x3)}
  # ---------------------------------------------------------------------------
  1. .section .text.rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2,"ax",@progbits
  2.     .p2align    4, 0x90                  # align entry to 16 bytes, pad with 0x90
  3.     .type   rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2,@function
  4. rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2:
  5.  # Prologue: save the six registers restored in the epilogue, reserve 24 bytes for spills.
  6.     .cfi_startproc
  7.     push rbp
  8.     .cfi_def_cfa_offset 16
  9.     push r15
  10.     .cfi_def_cfa_offset 24
  11.     push r14
  12.     .cfi_def_cfa_offset 32
  13.     push r13
  14.     .cfi_def_cfa_offset 40
  15.     push r12
  16.     .cfi_def_cfa_offset 48
  17.     push rbx
  18.     .cfi_def_cfa_offset 56
  19.     sub rsp, 24                          # spill slots: [rsp], [rsp+8], [rsp+16]
  20.     .cfi_def_cfa_offset 80
  21.     .cfi_offset rbx, -56
  22.     .cfi_offset r12, -48
  23.     .cfi_offset r13, -40
  24.     .cfi_offset r14, -32
  25.     .cfi_offset r15, -24
  26.     .cfi_offset rbp, -16
  27.  # Stage 1: load samples as zero-extended 16-bit words and build D0..D3 lane by lane.
  28.     movzx r11d, word ptr [rdi]           # A[0][0]
  29.  
  30.     movzx ebx, word ptr [rdi + 2]        # A[0][1]
  31.     movzx ebp, word ptr [rdi + 4]        # A[0][2]
  32.     movzx eax, word ptr [rdi + 6]        # A[0][3]
  33.  
  34.     movzx r8d, word ptr [rdx]            # B[0][0]
  35.  
  36.     movzx r9d, word ptr [rdx + 2]        # B[0][1]
  37.     movzx r10d, word ptr [rdx + 4]       # B[0][2]
  38.  
  39.     vmovd xmm0, r11d                     # xmm0 collects A row 0
  40.  
  41.     vmovd xmm1, r8d                      # xmm1 collects B row 0
  42.  
  43.     movzx r11d, word ptr [rdi + 2*rsi]   # A[1][0]
  44.  
  45.     movzx r8d, word ptr [rdx + 2*rcx]    # B[1][0]
  46.  
  47.     vpinsrd xmm0, xmm0, ebx, 1
  48.  
  49.     vpinsrd xmm1, xmm1, r9d, 1
  50.  
  51.     movzx ebx, word ptr [rdi + 2*rsi + 2]    # A[1][1]
  52.  
  53.     movzx r9d, word ptr [rdx + 2*rcx + 2]    # B[1][1]
  54.  
  55.     vpinsrd xmm0, xmm0, ebp, 2
  56.  
  57.     vpinsrd xmm1, xmm1, r10d, 2
  58.  
  59.     movzx ebp, word ptr [rdi + 2*rsi + 4]    # A[1][2]
  60.  
  61.     movzx r10d, word ptr [rdx + 2*rcx + 4]   # B[1][2]
  62.  
  63.     vpinsrd xmm0, xmm0, eax, 3           # xmm0 = A row 0 (complete)
  64.  
  65.     movzx eax, word ptr [rdx + 6]        # B[0][3]
  66.  
  67.     vmovd xmm2, r8d                      # xmm2 collects B row 1
  68.  
  69.     movzx r8d, word ptr [rdx + 4*rcx]    # B[2][0]
  70.  
  71.     vpinsrd xmm2, xmm2, r9d, 1
  72.  
  73.     movzx r9d, word ptr [rdx + 4*rcx + 2]    # B[2][1]
  74.  
  75.     vpinsrd xmm2, xmm2, r10d, 2
  76.  
  77.     movzx r10d, word ptr [rdx + 4*rcx + 4]   # B[2][2]
  78.  
  79.     vpinsrd xmm1, xmm1, eax, 3           # xmm1 = B row 0 (complete)
  80.  
  81.     movzx eax, word ptr [rdi + 2*rsi + 6]    # A[1][3]
  82.  
  83.     vpsubd xmm0, xmm0, xmm1              # xmm0 = D0 = A row 0 - B row 0
  84.  
  85.     vmovd xmm1, r11d                     # xmm1 now collects A row 1
  86.  
  87.     movzx r11d, word ptr [rdi + 4*rsi]   # A[2][0]
  88.  
  89.     vpextrd r14d, xmm0, 1                # r14d = d[0][1], kept for the column-1 butterfly
  90.  
  91.     vmovd xmm3, r8d                      # xmm3 collects B row 2
  92.  
  93.     vpinsrd xmm1, xmm1, ebx, 1
  94.  
  95.     movzx ebx, word ptr [rdi + 4*rsi + 2]    # A[2][1]
  96.  
  97.     vpinsrd xmm1, xmm1, ebp, 2
  98.  
  99.     movzx ebp, word ptr [rdi + 4*rsi + 4]    # A[2][2]
  100.  
  101.     vpinsrd xmm3, xmm3, r9d, 1
  102.     vpinsrd xmm3, xmm3, r10d, 2
  103.  
  104.     vpinsrd xmm1, xmm1, eax, 3           # xmm1 = A row 1 (complete)
  105.  
  106.     movzx eax, word ptr [rdx + 2*rcx + 6]    # B[1][3]
  107.  
  108.     vpinsrd xmm2, xmm2, eax, 3           # xmm2 = B row 1 (complete)
  109.  
  110.     movzx eax, word ptr [rdi + 4*rsi + 6]    # A[2][3]
  111.  
  112.     vpsubd xmm1, xmm1, xmm2              # xmm1 = D1 = A row 1 - B row 1
  113.  
  114.     vmovd xmm2, r11d                     # xmm2 now collects A row 2
  115.     vpinsrd xmm2, xmm2, ebx, 1
  116.     vpinsrd xmm2, xmm2, ebp, 2
  117.     vpinsrd xmm2, xmm2, eax, 3           # xmm2 = A row 2 (complete)
  118.  
  119.     movzx eax, word ptr [rdx + 4*rcx + 6]    # B[2][3]
  120.  
  121.     lea rcx, [rcx + 2*rcx]               # rcx = 3 * B stride, so 2*rcx addresses row 3
  122.  
  123.     movzx r8d, word ptr [rdx + 2*rcx]        # B[3][0]
  124.     movzx r9d, word ptr [rdx + 2*rcx + 2]    # B[3][1]
  125.     movzx r10d, word ptr [rdx + 2*rcx + 4]   # B[3][2]
  126.  
  127.     vpinsrd xmm3, xmm3, eax, 3           # xmm3 = B row 2 (complete)
  128.  
  129.     lea rax, [rsi + 2*rsi]               # rax = 3 * A stride, so 2*rax addresses row 3
  130.  
  131.     movzx esi, word ptr [rdi + 2*rax]        # A[3][0] (rsi no longer holds the stride)
  132.  
  133.     movzx ebx, word ptr [rdi + 2*rax + 2]    # A[3][1]
  134.     movzx ebp, word ptr [rdi + 2*rax + 4]    # A[3][2]
  135.     movzx eax, word ptr [rdi + 2*rax + 6]    # A[3][3]
  136.  
  137.     vpsubd xmm2, xmm2, xmm3              # xmm2 = D2 = A row 2 - B row 2
  138.  
  139.     vmovd xmm4, r8d                      # xmm4 collects B row 3
  140.  
  141.     vpextrd edi, xmm2, 2                 # edi = d[2][2] (rdi no longer holds the A pointer)
  142.  
  143.     vpinsrd xmm4, xmm4, r9d, 1
  144.  
  145.     vmovd r9d, xmm0                      # r9d = d[0][0]
  146.  
  147.     vpinsrd xmm4, xmm4, r10d, 2
  148.  
  149.     vmovd xmm3, esi                      # xmm3 now collects A row 3
  150.     vpinsrd xmm3, xmm3, ebx, 1
  151.     vpinsrd xmm3, xmm3, ebp, 2
  152.     vpinsrd xmm3, xmm3, eax, 3           # xmm3 = A row 3 (complete)
  153.  
  154.     movzx eax, word ptr [rdx + 2*rcx + 6]    # B[3][3]
  155.  
  156.     vpinsrd xmm4, xmm4, eax, 3           # xmm4 = B row 3 (complete)
  157.  
  158.     vmovd eax, xmm1                      # eax = d[1][0]
  159.  
  160.     vpsubd xmm3, xmm3, xmm4              # xmm3 = D3 = A row 3 - B row 3
  161.  # Stage 2: vertical butterflies, one column at a time, in scalar registers. Column 0 first.
  162.     lea r12d, [rax + r9]                 # d00 + d10
  163.  
  164.     sub r9d, eax                         # d00 - d10
  165.  
  166.     vmovd eax, xmm2                      # eax = d[2][0]
  167.  
  168.     vmovd edx, xmm3                      # edx = d[3][0]
  169.  
  170.     vpextrd r8d, xmm3, 3                 # r8d = d[3][3], held until the column-3 butterfly
  171.  
  172.     lea esi, [rdx + rax]                 # d20 + d30
  173.  
  174.     sub eax, edx                         # d20 - d30
  175.  
  176.     vpextrd edx, xmm3, 1                 # edx = d[3][1]
  177.  
  178.     lea r15d, [rax + r9]                 # r15d = t_0
  179.     sub r9d, eax                         # r9d  = u_0
  180.  
  181.     vpextrd eax, xmm1, 1                 # eax = d[1][1]
  182.  
  183.     lea r11d, [rsi + r12]                # r11d = s_0
  184.  
  185.     sub r12d, esi                        # r12d = v_0
  186.  # Column 1 (r14d already holds d[0][1]).
  187.     lea ecx, [rax + r14]                 # d01 + d11
  188.  
  189.     sub r14d, eax                        # d01 - d11
  190.  
  191.     vpextrd eax, xmm2, 1                 # eax = d[2][1]
  192.  
  193.     lea esi, [rdx + rax]                 # d21 + d31
  194.  
  195.     sub eax, edx                         # d21 - d31
  196.  
  197.     lea edx, [rsi + rcx]                 # edx = s_1
  198.  
  199.     sub ecx, esi                         # ecx = v_1
  200.  
  201.     vpextrd esi, xmm0, 2                 # esi = d[0][2]
  202.  
  203.     mov qword ptr [rsp + 16], rcx        # spill v_1
  204.  
  205.     lea ecx, [rax + r14]                 # ecx = t_1
  206.  
  207.     sub r14d, eax                        # r14d = u_1
  208.  
  209.     vpextrd eax, xmm1, 2                 # eax = d[1][2]
  210.  
  211.     mov qword ptr [rsp], rdx             # spill s_1
  212.     mov qword ptr [rsp + 8], rcx         # spill t_1
  213.  # Column 2 (edi already holds d[2][2]).
  214.     lea r10d, [rax + rsi]                # d02 + d12
  215.  
  216.     sub esi, eax                         # d02 - d12
  217.  
  218.     vpextrd eax, xmm3, 2                 # eax = d[3][2]
  219.  
  220.     lea ebp, [rax + rdi]                 # d22 + d32
  221.  
  222.     sub edi, eax                         # d22 - d32
  223.  
  224.     lea ebx, [rbp + r10]                 # ebx = s_2
  225.  
  226.     sub r10d, ebp                        # r10d = v_2
  227.  
  228.     lea edx, [rdi + rsi]                 # edx = t_2
  229.     sub esi, edi                         # esi = u_2
  230.  # Column 3 (r8d already holds d[3][3]).
  231.     vpextrd edi, xmm0, 3                 # edi = d[0][3]
  232.  
  233.     vpextrd ebp, xmm1, 3                 # ebp = d[1][3]
  234.  
  235.     lea r13d, [rbp + rdi]                # d03 + d13
  236.  
  237.     sub edi, ebp                         # d03 - d13
  238.  
  239.     vpextrd ebp, xmm2, 3                 # ebp = d[2][3]
  240.  
  241.     lea eax, [r8 + rbp]                  # d23 + d33
  242.  
  243.     sub ebp, r8d                         # d23 - d33
  244.  
  245.     lea ecx, [rax + r13]                 # ecx = s_3
  246.  
  247.     sub r13d, eax                        # r13d = v_3
  248.  
  249.     mov rax, qword ptr [rsp]             # reload s_1
  250.  
  251.     lea r8d, [rbp + rdi]                 # r8d = t_3
  252.     sub edi, ebp                         # edi = u_3
  253.  # Stage 3: horizontal butterflies. First the s row: s_0=r11d, s_1=eax, s_2=ebx, s_3=ecx.
  254.     lea ebp, [rax + r11]                 # s_0 + s_1
  255.  
  256.     sub r11d, eax                        # s_0 - s_1
  257.  
  258.     lea eax, [rcx + rbx]                 # s_2 + s_3
  259.  
  260.     sub ebx, ecx                         # s_2 - s_3
  261.  
  262.     lea ecx, [rax + rbp]                 # s_0 + s_1 + s_2 + s_3
  263.  
  264.     sub ebp, eax                         # (s_0 + s_1) - (s_2 + s_3)
  265.  
  266.     mov rax, qword ptr [rsp + 8]         # reload t_1
  267.  
  268.     mov dword ptr [rsp], ecx             # stage lane 0 of the s row through the stack slot
  269.  
  270.     lea ecx, [rbx + r11]                 # (s_0 - s_1) + (s_2 - s_3)
  271.  
  272.     sub r11d, ebx                        # (s_0 - s_1) - (s_2 - s_3)
  273.  
  274.     vmovd xmm1, dword ptr [rsp]          # xmm1 lane 0 = s-row total
  275.  # t row: t_0=r15d, t_1=eax, t_2=edx, t_3=r8d.
  276.     lea ebx, [rax + r15]                 # t_0 + t_1
  277.  
  278.     sub r15d, eax                        # t_0 - t_1
  279.  
  280.     lea eax, [r8 + rdx]                  # t_2 + t_3
  281.  
  282.     sub edx, r8d                         # t_2 - t_3
  283.  
  284.     lea r8d, [rax + rbx]                 # t_0 + t_1 + t_2 + t_3
  285.  
  286.     sub ebx, eax                         # (t_0 + t_1) - (t_2 + t_3)
  287.  
  288.     lea eax, [rdx + r15]                 # (t_0 - t_1) + (t_2 - t_3)
  289.  
  290.     sub r15d, edx                        # (t_0 - t_1) - (t_2 - t_3)
  291.  # u row begins: u_0=r9d, u_1=r14d, u_2=esi, u_3=edi.
  292.     lea edx, [r14 + r9]                  # u_0 + u_1
  293.  
  294.     sub r9d, r14d                        # u_0 - u_1
  295.  
  296.     vpinsrd xmm1, xmm1, ecx, 1
  297.     mov rcx, qword ptr [rsp + 16]        # reload v_1
  298.     vmovd xmm0, r8d                      # xmm0 collects the t-row outputs
  299.     vpinsrd xmm1, xmm1, ebp, 2
  300.     vpinsrd xmm1, xmm1, r11d, 3          # xmm1 = s-row outputs (complete)
  301.     vpinsrd xmm0, xmm0, eax, 1
  302.     vpinsrd xmm0, xmm0, ebx, 2
  303.  
  304.     lea ebx, [rdi + rsi]                 # u_2 + u_3
  305.  
  306.     sub esi, edi                         # u_2 - u_3
  307.  
  308.     lea edi, [rbx + rdx]                 # u_0 + u_1 + u_2 + u_3
  309.  
  310.     sub edx, ebx                         # (u_0 + u_1) - (u_2 + u_3)
  311.  
  312.     lea ebx, [rsi + r9]                  # (u_0 - u_1) + (u_2 - u_3)
  313.  
  314.     vpinsrd xmm0, xmm0, r15d, 3          # xmm0 = t-row outputs (complete)
  315.  
  316.     sub r9d, esi                         # (u_0 - u_1) - (u_2 - u_3)
  317.  
  318.     vmovd xmm2, edi                      # xmm2 collects the u-row outputs
  319.     vinserti128 ymm0, ymm1, xmm0, 1      # ymm0 = {s-row outputs (low 128), t-row outputs (high 128)}
  320.  # v row: v_0=r12d, v_1=ecx, v_2=r10d, v_3=r13d.
  321.     lea eax, [rcx + r12]                 # v_0 + v_1
  322.  
  323.     sub r12d, ecx                        # v_0 - v_1
  324.  
  325.     lea ecx, [r13 + r10]                 # v_2 + v_3
  326.  
  327.     sub r10d, r13d                       # v_2 - v_3
  328.  
  329.     vpabsd ymm0, ymm0                    # absolute values of the s-row and t-row outputs
  330.  
  331.     lea ebp, [rcx + rax]                 # v_0 + v_1 + v_2 + v_3
  332.  
  333.     sub eax, ecx                         # (v_0 + v_1) - (v_2 + v_3)
  334.  
  335.     lea ecx, [r10 + r12]                 # (v_0 - v_1) + (v_2 - v_3)
  336.  
  337.     vpinsrd xmm2, xmm2, ebx, 1
  338.  
  339.     sub r12d, r10d                       # (v_0 - v_1) - (v_2 - v_3)
  340.  
  341.     vmovd xmm3, ebp                      # xmm3 collects the v-row outputs
  342.     vpinsrd xmm2, xmm2, edx, 2
  343.     vpinsrd xmm2, xmm2, r9d, 3           # xmm2 = u-row outputs (complete)
  344.     vpinsrd xmm3, xmm3, ecx, 1
  345.     vpinsrd xmm1, xmm3, eax, 2
  346.     vpinsrd xmm1, xmm1, r12d, 3          # xmm1 = v-row outputs (complete)
  347.     vinserti128 ymm1, ymm1, xmm2, 1      # ymm1 = {v-row outputs (low 128), u-row outputs (high 128)}
  348.  
  349.     vpabsd ymm1, ymm1                    # absolute values of the v-row and u-row outputs
  350.  
  351.     vpaddd ymm0, ymm1, ymm0              # 8 lanes, each the sum of two absolute coefficients
  352.  # Stage 4: horizontal sum of the 8 dword lanes. 32-bit extracts zero-extend, adds are 64-bit.
  353.     vmovd eax, xmm0                      # lane 0
  354.  
  355.     vpextrd ecx, xmm0, 1                 # lane 1
  356.  
  357.     add rcx, rax
  358.  
  359.     vpextrd eax, xmm0, 2                 # lane 2
  360.  
  361.     add rax, rcx
  362.  
  363.     vpextrd ecx, xmm0, 3                 # lane 3
  364.  
  365.     vextracti128 xmm0, ymm0, 1           # bring lanes 4..7 down into xmm0
  366.  
  367.     add rcx, rax
  368.  
  369.     vmovd eax, xmm0                      # lane 4
  370.     vpextrd edx, xmm0, 2                 # lane 6
  371.  
  372.     add rax, rcx
  373.  
  374.     vpextrd ecx, xmm0, 1                 # lane 5
  375.  
  376.     add rcx, rax
  377.  
  378.     vpextrd eax, xmm0, 3                 # lane 7
  379.  
  380.     add rdx, rcx
  381.  
  382.     add rax, rdx                         # rax = total over all 8 lanes (return value)
  383.  # Epilogue: release spill space and restore saved registers in reverse push order.
  384.     add rsp, 24
  385.     .cfi_def_cfa_offset 56
  386.     pop rbx
  387.     .cfi_def_cfa_offset 48
  388.     pop r12
  389.  
  390.     .cfi_def_cfa_offset 40
  391.     pop r13
  392.  
  393.     .cfi_def_cfa_offset 32
  394.     pop r14
  395.  
  396.     .cfi_def_cfa_offset 24
  397.     pop r15
  398.     .cfi_def_cfa_offset 16
  399.     pop rbp
  400.  
  401.     .cfi_def_cfa_offset 8
  402.     vzeroupper                           # clear upper ymm halves before returning (ymm0/ymm1 were written)
  403.  
  404.     ret
  405.  
  406. .Lfunc_end666:
  407.     .size   rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2, .Lfunc_end666-rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2
  408.  
Advertisement