#include "edge_detect.h"
/**
 * Run one 3x3 Sobel pass over `src` and write its absolute response to `dst`.
 *
 * @param src  input image; read by exactly one hls::Sobel call.
 * @param dst  output image; receives hls::ConvertScaleAbs of the Sobel result.
 * @param dir  non-zero selects hls::Sobel<1,0,3>, zero selects hls::Sobel<0,1,3>.
 */
void mysobel(rgb_img_t &src, rgb_img_t &dst, int dir)
{
    int const rows = MAX_HEIGHT;
    int const cols = MAX_WIDTH;
    rgb_img_t img0(rows, cols);

    // The two directions are mutually exclusive: only one pass may read
    // `src` and fill `img0`, hence the explicit else branch.
    if (dir)
    {
        hls::Sobel<1,0,3>(src, img0);
    }
    else
    {
        hls::Sobel<0,1,3>(src, img0);
    }

    hls::ConvertScaleAbs(img0, dst);
}
/**
 * Combine both Sobel directions of `src` into `dst`.
 *
 * The input is duplicated, each copy goes through mysobel() with a different
 * `dir`, and the two results are summed with weights 1 and 1 and offset 0.
 *
 * @param src  input image.
 * @param dst  output image; receives hls::AddWeighted of the two passes.
 */
void mysobelxy(rgb_img_t &src, rgb_img_t &dst)
{
    int const rows = MAX_HEIGHT;
    int const cols = MAX_WIDTH;

    rgb_img_t img0(rows, cols);   // copy of src fed to the dir = 1 pass
    rgb_img_t img1(rows, cols);   // copy of src fed to the dir = 0 pass
    rgb_img_t img2(rows, cols);   // result of the dir = 1 pass
    rgb_img_t img3(rows, cols);   // result of the dir = 0 pass

    // Each mysobel() call needs its own copy of the input.
    hls::Duplicate(src, img0, img1);
    mysobel(img0, img2, 1);
    mysobel(img1, img3, 0);
    hls::AddWeighted(img2, 1, img3, 1, 0, dst);
}
/**
 * AXI4-Stream video in -> grayscale -> Sobel (both directions) -> RGB ->
 * AXI4-Stream video out.
 *
 * @param stream_in   incoming video stream, converted with hls::AXIvideo2Mat.
 * @param stream_out  outgoing video stream, written with hls::Mat2AXIvideo.
 */
void sobelfoo(stream_t &stream_in, stream_t &stream_out)
{
    int const rows = MAX_HEIGHT;
    int const cols = MAX_WIDTH;

    rgb_img_t img0(rows, cols);   // frame read from stream_in
    rgb_img_t img1(rows, cols);   // grayscale frame
    rgb_img_t img2(rows, cols);   // edge image
    rgb_img_t img4(rows, cols);   // edge image converted back to RGB

    hls::AXIvideo2Mat(stream_in, img0);
    hls::CvtColor<HLS_RGB2GRAY>(img0, img1);
    // Previously img1 was produced but never read and img2 was read but
    // never written; the Sobel stage is what links the two.
    mysobelxy(img1, img2);
    hls::CvtColor<HLS_GRAY2RGB>(img2, img4);
    hls::Mat2AXIvideo(img4, stream_out);
}
//void blurfoo(stream_t &stream_in, stream_t &stream_out)
// int const rows = MAX_HEIGHT;
// int const cols = MAX_WIDTH;
// rgb_img_t img0(rows, cols);
// rgb_img_t img1(rows, cols);
// hls::AXIvideo2Mat(stream_in, img0);
// hls::GaussianBlur<5,5>(img0, img1, (double)5, (double)5);
// hls::Mat2AXIvideo(img1, stream_out);
/**
 * Entry point declared in edge_detect.h: forwards both streams to sobelfoo().
 *
 * NOTE(review): no HLS interface or dataflow pragmas are visible in this
 * file; confirm they are supplied elsewhere (e.g. a directives file).
 *
 * @param stream_in   incoming video stream.
 * @param stream_out  outgoing video stream.
 */
void edge_detect(stream_t &stream_in, stream_t &stream_out)
{
    sobelfoo(stream_in, stream_out);
}

#include "hls_video.h"
// One stream element: ap_axiu with template arguments <24,1,1,1>.
// Presumably 24 data bits = 3 x 8-bit channels, matching HLS_8UC3 below -- confirm.
typedef ap_axiu<24,1,1,1> interface_t;
// 3-bit unsigned integer type.
typedef ap_uint<3> interface_3_bits;
// Stream of interface_t elements used for both input and output.
typedef hls::stream<interface_t> stream_t;
// Top-level function: reads a video stream and writes the processed stream.
void edge_detect(stream_t &stream_in, stream_t &stream_out);
// Maximum frame size; used as the hls::Mat template bounds and as the
// run-time rows/cols in the implementation.
#define MAX_WIDTH 1280
#define MAX_HEIGHT 720
// Image type bounded by MAX_HEIGHT x MAX_WIDTH with element type HLS_8UC3.
typedef hls::Mat<MAX_HEIGHT, MAX_WIDTH, HLS_8UC3> rgb_img_t;
// Pixel value with three unsigned char components.
typedef hls::Scalar<3, unsigned char> rgb_pix_t;
// Image file names (presumably used by a test bench that is not shown here).
#define INPUT_IMAGE "rover.bmp"
#define OUTPUT_IMAGE "rover_out.bmp"

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
# Raw measurements, one value per problem size (nine sizes in total).
# The series names below are the legend labels used when plotting.
Y = [661, 971, 1282, 1613, 1936, 2273, 2609, 2933, 3264]              # "FPGA 50 MHz"
Y2 = [205, 290, 395, 478, 562, 625, 718, 808, 899]                    # "FPGA 200 MHz"
Y_ARM = [424, 824 , 1224 , 1624, 2024, 2424 , 2824, 3224, 3624]       # "mesures ARM"
Y_gcc = [2296, 5119, 6715, 9078, 10830, 12541, 15041, 16780, 18883]   # "desktop non opti"
Y_O3 = [165, 287, 380, 559, 900, 901, 776, 1122, 1574]                # "desktop_opti"

# NOTE(review): nothing in this script fills X, yet every plot and both
# regressions need one abscissa per measurement (nine values). Populate it
# with the n_max values used for the measurements before running.
X = []
temps = []

# Rescale the raw values to the unit shown on the y axis.
# NOTE(review): Y2 is the only series left unscaled -- confirm this is intended.
for i in range(len(Y)):
    Y[i] = Y[i]*3/100
    Y_ARM[i] = Y_ARM[i]*3/100
    Y_gcc[i] = Y_gcc[i]*3/100/4
    Y_O3[i] = Y_O3[i]*3/100/4

plt.scatter(X, Y, color="b", marker="x", label="FPGA 50 MHz")
plt.scatter(X, Y2, color="g", marker="x", label="FPGA 200 MHz")
plt.scatter(X, Y_ARM, color="g", label="mesures ARM")
plt.scatter(X, Y_gcc, color="y",marker="*", label="desktop non opti")
plt.scatter(X, Y_O3, color="pink", marker="*",label="desktop_opti")

# Linear fits of the two FPGA series (scikit-learn expects column vectors).
x = np.array(X).reshape(-1, 1)
y = np.array(Y).reshape(-1, 1)
y2 = np.array(Y2).reshape(-1, 1)
reg = LinearRegression().fit(x, y)
reg2 = LinearRegression().fit(x, y2)

print("score obtenu : " + str(reg.score(x, y)))
# The second data set must be scored with its own model (reg2, not reg).
print("score obtenu : " + str(reg2.score(x, y2)))
print("attente à zéro : {}".format(reg.intercept_))
print("attente à zéro : {}".format(reg2.intercept_))

# Draw each fitted line between x = 0 and the largest abscissa.
x_lin = [0, max(X)]
x_ends = np.array(x_lin).reshape(-1, 1)
y_lin = reg.predict(x_ends).ravel().tolist()
y_lin2 = reg2.predict(x_ends).ravel().tolist()
plt.plot(x_lin, y_lin, color = "r", label="RegLin 50 score : {:.4f}".format(reg.score(x, y)))
plt.plot(x_lin, y_lin2, color = "r")

plt.xlim([0, 500])
plt.ylim([0, 100])
plt.title("Temps d'exécution en fonction de n_max")
plt.ylabel("T (0.1 µs)")

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Combinational adder/subtractor on unsigned operands.
--   Op = '0' : S = A + B, Cout = carry out of the addition
--   Op /= '0': S = A - B, Cout = '1' when the subtraction wraps (A < B)
entity add_sub is
  generic(nb_bits : natural := 16);                      -- operand width
  port( A, B : in  STD_LOGIC_VECTOR(nb_bits-1 downto 0);
        Op   : in  STD_LOGIC;                            -- operation choice: '0' --> addition, '1' subtraction
        S    : out STD_LOGIC_VECTOR(nb_bits-1 downto 0); -- result
        Cout : out STD_LOGIC);                           -- carry out
end add_sub;

architecture proced of add_sub is
  -- Operands and result are one bit wider than the ports so that the
  -- carry/borrow is kept in bit nb_bits.
  signal Aint, Bint : unsigned(nb_bits downto 0);
  signal Sint       : unsigned(nb_bits downto 0);
begin
  -- Zero-extend both operands by one bit.
  Aint(nb_bits-1 downto 0) <= unsigned(A);
  Aint(nb_bits)            <= '0';
  Bint(nb_bits-1 downto 0) <= unsigned(B);
  Bint(nb_bits)            <= '0';

  process(Op, Aint, Bint)
  begin
    if (Op = '0') then
      Sint <= Aint + Bint;
    else
      Sint <= Aint - Bint;
    end if;
  end process;

  S    <= std_logic_vector(Sint(nb_bits-1 downto 0));
  Cout <= Sint(nb_bits);
end proced;

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Down counter with asynchronous initialization and a zero flag.
-- Init = '1' loads nb_iter-1; each rising clock edge with encount = '1'
-- decrements the value; ceqz is '1' exactly when the value is 0.
entity decounter is
  generic(nb_bits : natural ;     -- width of the decounter
          nb_iter : natural );    -- number of iterations to be performed
  port( Init    : in  STD_LOGIC;  -- Initialization command for the decounter; active on '1'
        encount : in  STD_LOGIC;  -- enable input for the decounter; active on '1'
        clk     : in  STD_LOGIC;  -- clock
        ceqz    : out STD_LOGIC); -- output indicating if decounter=0 when it is at '1'
end decounter;

architecture proced of decounter is
  signal Sint    : unsigned(nb_bits-1 downto 0);
  signal ceqzint : std_logic;
begin
  -- Counter register. The enable is tested inside the clocked branch rather
  -- than inside the edge expression, so the clock condition stays a plain
  -- rising edge.
  Ps: process(clk, Init)
  begin
    if (Init = '1') then
      Sint <= TO_UNSIGNED(nb_iter-1, nb_bits); -- use of a conversion function from numeric_std library
    elsif rising_edge(clk) then
      if (encount = '1') then
        Sint <= Sint - 1;
      end if;
    end if;
  end process;

  -- Combinational zero detection.
  Pc: process(Sint)
  begin
    if Sint > 0 then
      ceqzint <= '0';
    else
      ceqzint <= '1';
    end if;
  end process;

  ceqz <= ceqzint;
end proced;

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- 3-to-1 multiplexer: sel = "00" selects I0, sel = "01" selects I1,
-- every other value of sel selects I2.
entity mux3_1 is
  generic(nb_bits : natural );                               -- data width
  port( I0, I1, I2 : in  STD_LOGIC_VECTOR(nb_bits-1 downto 0);
        sel        : in  STD_LOGIC_VECTOR(1 downto 0);       -- select input
        S          : out STD_LOGIC_VECTOR(nb_bits-1 downto 0));
end mux3_1;

architecture proced of mux3_1 is
  signal Sint : STD_LOGIC_VECTOR(nb_bits-1 downto 0); -- internal copy of the output
begin
  -- I2 must be in the sensitivity list: otherwise a change on I2 while it is
  -- selected would not update the output in simulation.
  process(sel, I0, I1, I2)
  begin
    if (sel = "00") then
      Sint <= I0;
    elsif (sel = "01") then
      Sint <= I1;
    else
      Sint <= I2;
    end if;
  end process;

  S <= Sint;
end proced;

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Register with asynchronous initialization and synchronous load.
-- Init = '1' copies init_value into the register; otherwise E is stored on
-- each rising clock edge where ld = '1'. S always shows the stored value.
entity reg is
  generic(nb_bits : natural);                                      -- register width
  port( Init       : in  STD_LOGIC;                                -- Initialization command
        init_value : in  STD_LOGIC_VECTOR(nb_bits-1 downto 0);     -- value loaded while Init = '1'
        ld         : in  STD_LOGIC;                                -- load command
        clk        : in  STD_LOGIC;
        E          : in  STD_LOGIC_VECTOR(nb_bits-1 downto 0);     -- data input
        S          : out STD_LOGIC_VECTOR(nb_bits-1 downto 0));    -- stored value
end reg;

architecture proced of reg is
  signal Sint : STD_LOGIC_VECTOR(nb_bits-1 downto 0);
begin
  -- The load enable is tested inside the clocked branch rather than inside
  -- the edge expression, so the clock condition stays a plain rising edge.
  Ps: process(clk, Init)
  begin
    if (Init = '1') then
      Sint <= init_value;
    elsif rising_edge(clk) then
      if (ld = '1') then
        Sint <= E;
      end if;
    end if;
  end process;

  S <= Sint;
end proced;

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Port-less entity shared by the test architectures declared below
-- (one architecture per design under test).
entity testbench is
end testbench;
-- Test of decounter(proced) with a 4-bit counter and nb_iter = 16.
-- Init is released after 20 ns, the enable is raised at 90 ns, and the
-- clock has a 20 ns period.
architecture test_decounter of testbench is
  signal clk, sig_init, sig_eqz, sig_en : std_logic;
begin
  sig_init <= '1', '0' after 20 ns;
  sig_en   <= '0', '1' after 90 ns;

  DUT: entity work.decounter(proced)
    generic map(nb_bits => 4, nb_iter => 16)
    port map(encount => sig_en, clk => clk, init => sig_init, ceqz => sig_eqz);

  -- Generates 30 clock periods and then stops.
  Gene_clk: process
  begin
    clk <= '0';
    wait for 10 ns;
    for i in 1 to 30 loop
      clk <= '1';
      wait for 10 ns;
      clk <= '0';
      wait for 10 ns;
    end loop;
    -- Suspend forever; without this the process would restart and the
    -- simulation would never run out of events.
    wait;
  end process;
end test_decounter;
-- Test of mux2_1(proced): constant inputs 100 and 0, select goes from '1'
-- to '0' at 50 ns.
-- NOTE(review): entity mux2_1 is not declared in this file; it must be
-- compiled into library work from another source.
architecture test_mux2_1 of testbench is
  signal sig_sel : std_logic;
  signal sig_A, sig_B, sig_S : std_logic_vector(15 downto 0);
begin
  sig_sel <= '1', '0' after 50 ns;
  sig_A   <= std_logic_vector(to_unsigned(100,16));
  sig_B   <= std_logic_vector(to_unsigned(0,16));

  DUT: entity work.mux2_1(proced)
    generic map(nb_bits => 16)
    port map(I0 => sig_A, I1 => sig_B, sel => sig_sel, S => sig_S);
end test_mux2_1;
-- Test of add_sub(proced) on 16 bits.
-- B is constantly 1; A goes from 1 to 0 at 50 ns; the operation is addition
-- until 100 ns, subtraction until 200 ns, then addition again. From 100 ns
-- to 200 ns the design therefore computes 0 - 1, which exercises Cout.
architecture test_add_sub of testbench is
  signal sig_op, sig_cout : std_logic;
  signal sig_A, sig_B, sig_res : std_logic_vector(15 downto 0);
begin
  sig_op <= '0', '1' after 100 ns, '0' after 200 ns;
  sig_A  <= std_logic_vector(to_unsigned(1,16)), std_logic_vector(to_unsigned(0,16)) after 50 ns;
  sig_B  <= std_logic_vector(to_unsigned(1,16));

  DUT: entity work.add_sub(proced)
    generic map(nb_bits => 16)
    port map(A => sig_A, B => sig_B, op => sig_op, S => sig_res, cout => sig_cout);
end test_add_sub;
-- Test of mux3_1(proced) on 16 bits with constant inputs 100, 0 and 32.
-- The select input steps through "00", "01", "10", "11" and back to "00",
-- changing every 50 ns.
architecture test_mux3_1 of testbench is
  signal sig_sel : std_logic_vector(1 downto 0);
  signal sig_A, sig_B, sig_C, sig_S : std_logic_vector(15 downto 0);
begin
  sig_sel <= "00", "01" after 50 ns, "10" after 100 ns, "11" after 150 ns, "00" after 200 ns;
  sig_A   <= std_logic_vector(to_unsigned(100,16));
  sig_B   <= std_logic_vector(to_unsigned(0,16));
  sig_C   <= std_logic_vector(to_unsigned(32,16));

  DUT: entity work.mux3_1(proced)
    generic map(nb_bits => 16)
    port map(I0 => sig_A, I1 => sig_B, I2 => sig_C, sel => sig_sel, S => sig_S);
end test_mux3_1;

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Clocked state machine that runs a five-step iteration on m_in and
-- publishes the final value of Z on m_out (presumably an integer square
-- root, going by the entity name -- confirm).
--   attente : wait for START = '1'
--   init    : latch m_in into X, set V = 256, Z = 0, i = 5
--   calcul  : one iteration per clock, V divided by 4 each time
--   fin     : drive m_out, stay here while START = '1'
-- RESET is synchronous and returns the machine to attente.
entity racine_machine is
  port (
    clock : IN  std_logic;
    START : IN  std_logic;
    RESET : IN  std_logic;
    m_in  : IN  std_logic_vector(3 downto 0);
    m_out : OUT std_logic_vector(3 downto 0)
    -- count : OUT std_logic_vector(3 downto 0)
  ) ;
end racine_machine;

architecture behavior of racine_machine is
  type etat is (attente, init, calcul, fin);
  type cal is ('0','1');
  signal state : etat := attente;
  signal done  : cal;            -- internal flag, '1' only in state fin
begin
  Racine : process(clock)
    variable X    : integer;     -- remaining value
    variable V    : integer;     -- current weight: 256, 64, 16, 4, 1
    variable Z    : integer;     -- running result
    variable cond : integer;     -- trial subtraction X - Z
    variable i    : integer;     -- iterations left
  begin
    if (rising_edge(clock)) then
      if RESET = '1' then
        state <= attente;
      else
        case state is
          when attente =>
            if START = '1' then
              state <= init;
            else
              state <= attente;
            end if;
            done <= '0';

          when init =>
            X := to_integer(unsigned(m_in));
            V := 256;
            Z := 0;
            i := 5;
            done  <= '0';
            state <= calcul;

          when calcul =>
            Z := Z+V;
            cond := X-Z;
            if cond >= 0 then
              -- The trial value fits: subtract it and keep the new bit.
              X := X-Z;
              Z := (Z+V)/2;
            else
              -- It does not fit: undo the trial addition.
              Z := (Z-V)/2;
            end if;
            V := V/4;
            i := i-1;
            -- count <= std_logic_vector(to_unsigned(i, count'length));
            done <= '0';
            if i = 0 then
              state <= fin;
            else
              state <= calcul;
            end if;

          when fin =>
            done  <= '1';
            m_out <= std_logic_vector(to_unsigned(Z, m_out'length));
            if START = '1' then
              state <= fin;
            else
              state <= attente;
            end if;

          when others =>
            state <= attente;
        end case;
      end if;
    end if;
  end process Racine;
end behavior;

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- Clocked state machine that runs a five-step iteration on INPUT and
-- publishes the final value of Z on OUTPUT (presumably an integer square
-- root, going by the process name "Racine" -- confirm).
--   attente : wait for START = '1'
--   init    : latch INPUT into X, set V = 256, Z = 0, i = 5
--   calcul  : one iteration per clock, V divided by 4 each time;
--             count shows the number of iterations left
--   fin     : drive OUTPUT, stay here while START = '1'
-- RESET is synchronous and returns the machine to attente.
entity machine is
  port (
    clock  : IN  std_logic;
    START  : IN  std_logic;
    RESET  : IN  std_logic;
    INPUT  : IN  std_logic_vector(7 downto 0);
    OUTPUT : OUT std_logic_vector(7 downto 0);
    count  : OUT std_logic_vector(7 downto 0)
  ) ;
end machine;

architecture behavior of machine is
  type etat is (attente, init, calcul, fin);
  type cal is ('0','1');
  signal state : etat := attente;
  signal done  : cal;            -- internal flag, '1' only in state fin
begin
  Racine : process(clock)
    variable X    : integer;     -- remaining value
    variable V    : integer;     -- current weight: 256, 64, 16, 4, 1
    variable Z    : integer;     -- running result
    variable cond : integer;     -- trial subtraction X - Z
    variable i    : integer;     -- iterations left
  begin
    if (rising_edge(clock)) then
      -- RESET must exclude the case statement: every branch below assigns
      -- state, and the last assignment in a process wins, so a reset that
      -- is merely placed before the case would never take effect.
      if RESET = '1' then
        state <= attente;
      else
        case state is
          when attente =>
            if START = '1' then
              state <= init;
            else
              state <= attente;
            end if;
            done <= '0';

          when init =>
            X := to_integer(unsigned(INPUT));
            V := 256;
            Z := 0;
            i := 5;
            done  <= '0';
            state <= calcul;

          when calcul =>
            Z := Z+V;
            cond := X-Z;
            if cond >= 0 then
              -- The trial value fits: subtract it and keep the new bit.
              X := X-Z;
              Z := (Z+V)/2;
            else
              -- It does not fit: undo the trial addition.
              Z := (Z-V)/2;
            end if;
            V := V/4;
            i := i-1;
            count <= std_logic_vector(to_unsigned(i, count'length));
            done <= '0';
            if i = 0 then
              state <= fin;
            else
              state <= calcul;
            end if;

          when fin =>
            done   <= '1';
            OUTPUT <= std_logic_vector(to_unsigned(Z, OUTPUT'length));
            if START = '1' then
              state <= fin;
            else
              state <= attente;
            end if;

          when others =>
            state <= attente;
        end case;
      end if;
    end if;
  end process Racine;
end behavior;

C:/Users/sradosa/Documents/VHDL/RacineCarre/MachineEtat.vhd {1 {vcom -work work -2002 -explicit C:/Users/sradosa/Documents/VHDL/RacineCarre/MachineEtat.vhd
Model Technology ModelSim ALTERA vcom 10.1d Compiler 2012.11 Nov 2 2012
-- Loading package STANDARD
-- Loading package TEXTIO
-- Loading package std_logic_1164
-- Loading package NUMERIC_STD
-- Compiling entity machine
-- Compiling architecture behavior of machine
} {} {}}

#include <stdio.h>
/*
 * Replace every element of data[0..n-1] by the inverse of its square,
 * i.e. data[i] = 1 / (data[i] * data[i]). Does nothing when n <= 0.
 *
 * The loop is kept as a plain counted loop over floats so that the compiler
 * can vectorize it on its own.
 */
void f(float * data, int n){
    for(int i = 0; i<n; i++){
        data[i] = 1.f/(data[i]*data[i]);
    }
}
/*
 * Replace every element of data[0..n-1] by the inverse of its square and
 * return the sum of the new values. Returns 0 when n <= 0.
 */
float fsum(float * data, int n){
    float s = 0;
    for(int i = 0; i<n; i++){
        data[i] = 1.f/(data[i]*data[i]);
        s += data[i];
    }
    return s;
}
/* Empty entry point: the file only exists to inspect the code generated for
 * the functions above. */
int main(void){
    return 0;
}

.file "toto.c"
# f(data, n) from toto.c, scalar version: one float per loop iteration.
# Arguments: data in %rdi, n in %esi.
# Label definitions (.L1, .L3, .LC0) are not present in this listing.
.globl f
.type f, @function
# Nothing to do when n <= 0.
testl %esi, %esi
jle .L1
# %rax = cursor over data, %rdx = data + 4*n (one past the last element).
movq %rdi, %rax
leal -1(%rsi), %edx
leaq 4(%rdi,%rdx,4), %rdx
# %xmm1 = constant at .LC0 (presumably 1.0f: the .rodata.cst4 section of
# this listing holds .long 1065353216 = 0x3F800000).
movss .LC0(%rip), %xmm1
# Loop body: %xmm0 = x*x, %xmm2 = constant / (x*x), stored back in place.
movss (%rax), %xmm0
mulss %xmm0, %xmm0
movaps %xmm1, %xmm2
divss %xmm0, %xmm2
movss %xmm2, (%rax)
# Advance by one float and repeat until the end pointer is reached.
addq $4, %rax
cmpq %rdx, %rax
jne .L3
.size f, .-f
# fsum(data, n) from toto.c, scalar version: one float per loop iteration.
# Arguments: data in %rdi, n in %esi; the sum is returned in %xmm0.
# Label definitions (.L5, .L7, .L8, .LC0) are not present in this listing.
.globl fsum
.type fsum, @function
# n <= 0 is handled separately (.L8).
testl %esi, %esi
jle .L8
# %rax = cursor over data, %rdx = data + 4*n (one past the last element).
movq %rdi, %rax
leal -1(%rsi), %edx
leaq 4(%rdi,%rdx,4), %rdx
# %xmm1 = running sum, cleared; %xmm2 = constant at .LC0 (presumably 1.0f).
pxor %xmm1, %xmm1
movss .LC0(%rip), %xmm2
# Loop body: %xmm3 = constant / (x*x), stored back and added to the sum.
movss (%rax), %xmm0
mulss %xmm0, %xmm0
movaps %xmm2, %xmm3
divss %xmm0, %xmm3
movss %xmm3, (%rax)
addss %xmm3, %xmm1
addq $4, %rax
cmpq %rdx, %rax
jne .L7
# Move the sum into the return register.
movaps %xmm1, %xmm0
# Presumably the .L8 path: clear the sum, then join the common exit at .L5.
pxor %xmm1, %xmm1
jmp .L5
.size fsum, .-fsum
.globl main
.type main, @function
movl $0, %eax
.size main, .-main
.section .rodata.cst4,"aM",@progbits,4
.align 4
.long 1065353216
.ident "GCC: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0"
.section .note.GNU-stack,"",@progbits
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
.string "GNU"
.align 8
.long 0xc0000002
.long 3f - 2f
.long 0x3
.align 8

.file "toto.c"
# f(data, n) from toto.c, vectorized version: four floats per iteration of
# the main loop, followed by up to three scalar tail elements.
# Arguments: data in %rdi, n in %esi.
# Label definitions are not present in this listing.
.p2align 4
.globl f
.type f, @function
# Nothing to do when n <= 0.
testl %esi, %esi
jle .L1
# With n-1 <= 2 (fewer than four elements) skip the vector loop (.L8).
leal -1(%rsi), %eax
cmpl $2, %eax
jbe .L8
# %rax = cursor, %rdx = data + 16*(n/4) = end of the vectorized part.
movl %esi, %edx
movq %rdi, %rax
shrl $2, %edx
salq $4, %rdx
addq %rdi, %rdx
.p2align 4,,10
.p2align 3
# Vector loop. d = x*x on four lanes; rcpps gives an approximate reciprocal
# r, which is then refined as 2*r - d*r*r (one Newton-Raphson step) instead
# of using a division.
movups (%rax), %xmm0
addq $16, %rax
mulps %xmm0, %xmm0
rcpps %xmm0, %xmm1
mulps %xmm1, %xmm0
mulps %xmm1, %xmm0
addps %xmm1, %xmm1
subps %xmm0, %xmm1
movups %xmm1, -16(%rax)
cmpq %rdx, %rax
jne .L4
# %eax = n rounded down to a multiple of 4; finished when n % 4 == 0.
movl %esi, %eax
andl $-4, %eax
testb $3, %sil
je .L11
# Scalar tail, first element (index %eax): constant at .LC1 divided by x*x
# with a real divss. The constant is presumably 1.0f.
movslq %eax, %rdx
leaq (%rdi,%rdx,4), %rdx
movss (%rdx), %xmm0
movaps %xmm0, %xmm1
mulss %xmm0, %xmm1
movss .LC1(%rip), %xmm0
movaps %xmm0, %xmm3
divss %xmm1, %xmm3
movss %xmm3, (%rdx)
# Second tail element, only if index+1 < n.
leal 1(%rax), %edx
cmpl %edx, %esi
jle .L1
movslq %edx, %rdx
movaps %xmm0, %xmm4
addl $2, %eax
leaq (%rdi,%rdx,4), %rdx
movss (%rdx), %xmm1
mulss %xmm1, %xmm1
divss %xmm1, %xmm4
movss %xmm4, (%rdx)
# Third tail element, only if index+2 < n.
cmpl %eax, %esi
jle .L1
leaq (%rdi,%rax,4), %rax
movss (%rax), %xmm1
mulss %xmm1, %xmm1
divss %xmm1, %xmm0
movss %xmm0, (%rax)
.p2align 4,,10
.p2align 3
# Presumably the .L8 path: start the scalar tail at index 0.
xorl %eax, %eax
jmp .L3
.size f, .-f
# fsum(data, n) from toto.c, vectorized version: four floats per iteration
# of the main loop with a four-lane partial sum, then a horizontal add and
# up to three scalar tail elements.
# Arguments: data in %rdi, n in %esi; the sum is returned in %xmm0.
# Label definitions are not present in this listing.
.p2align 4
.globl fsum
.type fsum, @function
# n <= 0 is handled separately (.L18).
testl %esi, %esi
jle .L18
# With n-1 <= 2 (fewer than four elements) skip the vector loop (.L19).
leal -1(%rsi), %eax
cmpl $2, %eax
jbe .L19
# %rax = cursor, %xmm2 = four partial sums (cleared),
# %rdx = data + 16*(n/4) = end of the vectorized part.
movl %esi, %edx
movq %rdi, %rax
pxor %xmm2, %xmm2
shrl $2, %edx
salq $4, %rdx
addq %rdi, %rdx
.p2align 4,,10
.p2align 3
# Vector loop. d = x*x on four lanes; rcpps gives an approximate reciprocal
# r, refined as 2*r - d*r*r; the result is stored and accumulated.
movups (%rax), %xmm1
addq $16, %rax
mulps %xmm1, %xmm1
rcpps %xmm1, %xmm0
mulps %xmm0, %xmm1
mulps %xmm0, %xmm1
addps %xmm0, %xmm0
subps %xmm1, %xmm0
movups %xmm0, -16(%rax)
addps %xmm0, %xmm2
cmpq %rdx, %rax
jne .L15
# Horizontal add of the four partial sums into lane 0 of %xmm0
# (high half onto low half, then lane 1 onto lane 0).
# %eax = n rounded down to a multiple of 4.
movaps %xmm2, %xmm0
movl %esi, %eax
movhlps %xmm2, %xmm0
andl $-4, %eax
addps %xmm0, %xmm2
movaps %xmm2, %xmm0
shufps $85, %xmm2, %xmm0
addps %xmm0, %xmm2
movaps %xmm2, %xmm0
# Finished when n % 4 == 0.
testb $3, %sil
je .L21
# Scalar tail, first element (index %eax): constant at .LC1 divided by x*x,
# stored and added to the sum. The constant is presumably 1.0f.
movslq %eax, %rdx
leaq (%rdi,%rdx,4), %rdx
movss (%rdx), %xmm1
movaps %xmm1, %xmm2
mulss %xmm1, %xmm2
movss .LC1(%rip), %xmm1
movaps %xmm1, %xmm4
divss %xmm2, %xmm4
movss %xmm4, (%rdx)
leal 1(%rax), %edx
addss %xmm4, %xmm0
# Second tail element, only if index+1 < n.
cmpl %edx, %esi
jle .L12
movslq %edx, %rdx
movaps %xmm1, %xmm5
addl $2, %eax
leaq (%rdi,%rdx,4), %rdx
movss (%rdx), %xmm2
mulss %xmm2, %xmm2
divss %xmm2, %xmm5
addss %xmm5, %xmm0
movss %xmm5, (%rdx)
# Third tail element, only if index+2 < n.
cmpl %eax, %esi
jle .L12
leaq (%rdi,%rax,4), %rax
movss (%rax), %xmm2
mulss %xmm2, %xmm2
divss %xmm2, %xmm1
addss %xmm1, %xmm0
movss %xmm1, (%rax)
.p2align 4,,10
.p2align 3
# Presumably the .L18 path (n <= 0): the returned sum is 0.
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
# Presumably the .L19 path: start the scalar tail at index 0 with sum 0.
xorl %eax, %eax
pxor %xmm0, %xmm0
jmp .L14
.size fsum, .-fsum
.section .text.startup,"ax",@progbits
.p2align 4
.globl main
.type main, @function
xorl %eax, %eax
.size main, .-main
.section .rodata.cst4,"aM",@progbits,4
.align 4
.long 1065353216
.ident "GCC: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0"
.section .note.GNU-stack,"",@progbits
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
.string "GNU"
.align 8
.long 0xc0000002
.long 3f - 2f
.long 0x3
.align 8

#include <stdio.h>
#include <math.h>
/*
 * y[i] += a * x[i] for i in [0, n). Does nothing when n <= 0.
 *
 * The loop is kept as a plain counted loop over floats so that the compiler
 * can vectorize it on its own.
 */
void f(float * x, float * y, float a, int n){
    for(int i = 0; i<n; i++){
        y[i] += a*x[i];
    }
}
void f2(float * x, float * y, float a, int n){
for(int i = 0; i<n; i++){
y[i] += sqrt(a*x[i]);
/* Empty entry point: the file only exists to inspect the code generated for
 * the functions above. */
int main(void){
    return 0;
}

.file "toto2.c"
# f(x, y, a, n) from toto2.c, scalar version: one float per loop iteration.
# Arguments: x in %rdi, y in %rsi, a in %xmm0, n in %edx.
# Label definitions (.L1, .L3) are not present in this listing.
.globl f
.type f, @function
# Nothing to do when n <= 0.
testl %edx, %edx
jle .L1
# %rcx = n-1 (last index), %rax = i = 0.
leal -1(%rdx), %ecx
movl $0, %eax
# Loop body: %xmm1 = a * x[i] + y[i], stored into y[i].
movaps %xmm0, %xmm1
mulss (%rdi,%rax,4), %xmm1
addss (%rsi,%rax,4), %xmm1
movss %xmm1, (%rsi,%rax,4)
# Keep the index just processed in %rdx, increment i, and loop until the
# processed index equals n-1.
movq %rax, %rdx
addq $1, %rax
cmpq %rcx, %rdx
jne .L3
.size f, .-f
# f2(x, y, a, n) from toto2.c, scalar version: one float per loop iteration.
# Arguments: x in %rdi, y in %rsi, a in %xmm0, n in %edx.
# Label definitions (.L5, .L7) are not present in this listing.
.globl f2
.type f2, @function
# Nothing to do when n <= 0.
testl %edx, %edx
jle .L5
# %rax = cursor over y, %rdx = y + 4*n (one past the last element);
# %rdi is used directly as the cursor over x.
movq %rsi, %rax
leal -1(%rdx), %edx
leaq 4(%rsi,%rdx,4), %rdx
# Loop body: the float product a*x[i] is widened to double before the
# square root (sqrtsd).
movaps %xmm0, %xmm1
mulss (%rdi), %xmm1
cvtss2sd %xmm1, %xmm1
movapd %xmm1, %xmm2
sqrtsd %xmm2, %xmm2
# y[i] is widened to double too, the addition is done in double, and the
# result is narrowed back to float before the store.
pxor %xmm1, %xmm1
cvtss2sd (%rax), %xmm1
addsd %xmm2, %xmm1
cvtsd2ss %xmm1, %xmm1
movss %xmm1, (%rax)
# Advance both cursors by one float and repeat until the end of y.
addq $4, %rax
addq $4, %rdi
cmpq %rdx, %rax
jne .L7
.size f2, .-f2
.globl main
.type main, @function
movl $0, %eax
.size main, .-main
.ident "GCC: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0"
.section .note.GNU-stack,"",@progbits
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
.string "GNU"
.align 8
.long 0xc0000002
.long 3f - 2f
.long 0x3
.align 8

.file "toto2.c"
# f(x, y, a, n) from toto2.c, vectorized version: four floats per iteration
# of the main loop, up to three scalar tail elements, and a fully scalar
# fallback loop.
# Arguments: x in %rdi, y in %rsi, a in %xmm0, n in %edx.
# Label definitions are not present in this listing.
.p2align 4
.globl f
.type f, @function
# Nothing to do when n <= 0.
testl %edx, %edx
jle .L1
# Run-time check on the distance between y and x: when (y + 15 - x),
# compared as unsigned, is <= 30 the arrays are too close for 16-byte
# accesses and the scalar fallback (.L3) is used.
leaq 15(%rsi), %rcx
leal -1(%rdx), %eax
subq %rdi, %rcx
cmpq $30, %rcx
jbe .L3
# Also use the fallback with n-1 <= 2 (fewer than four elements).
cmpl $2, %eax
jbe .L3
# %xmm2 = a copied into all four lanes, %rax = byte offset = 0,
# %rcx = 16*(n/4) = size in bytes of the vectorized part.
movl %edx, %ecx
movaps %xmm0, %xmm2
xorl %eax, %eax
shrl $2, %ecx
shufps $0, %xmm2, %xmm2
salq $4, %rcx
.p2align 4,,10
.p2align 3
# Vector loop: y[i..i+3] = a * x[i..i+3] + y[i..i+3].
movups (%rdi,%rax), %xmm1
movups (%rsi,%rax), %xmm3
mulps %xmm2, %xmm1
addps %xmm3, %xmm1
movups %xmm1, (%rsi,%rax)
addq $16, %rax
cmpq %rcx, %rax
jne .L4
# %eax = n rounded down to a multiple of 4; finished when n % 4 == 0.
movl %edx, %eax
andl $-4, %eax
testb $3, %dl
je .L1
# Scalar tail, first element (index %eax).
movl %eax, %r8d
movss (%rdi,%r8,4), %xmm1
leaq (%rsi,%r8,4), %rcx
mulss %xmm0, %xmm1
addss (%rcx), %xmm1
movss %xmm1, (%rcx)
# Second tail element, only if index+1 < n.
leal 1(%rax), %ecx
cmpl %ecx, %edx
jle .L1
movslq %ecx, %rcx
addl $2, %eax
movss (%rdi,%rcx,4), %xmm1
leaq (%rsi,%rcx,4), %r8
mulss %xmm0, %xmm1
addss (%r8), %xmm1
movss %xmm1, (%r8)
# Third tail element, only if index+2 < n.
cmpl %eax, %edx
jle .L1
mulss (%rdi,%rax,4), %xmm0
leaq (%rsi,%rax,4), %rdx
addss (%rdx), %xmm0
movss %xmm0, (%rdx)
.p2align 4,,10
.p2align 3
# Presumably the .L3 fallback: %rdx = n-1 (last index), i = 0.
movl %eax, %edx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
# Scalar loop: y[i] = a * x[i] + y[i], until the processed index is n-1.
movss (%rdi,%rax,4), %xmm1
movq %rax, %rcx
mulss %xmm0, %xmm1
addss (%rsi,%rax,4), %xmm1
movss %xmm1, (%rsi,%rax,4)
addq $1, %rax
cmpq %rdx, %rcx
jne .L6
.size f, .-f
# f2(x, y, a, n) from toto2.c, vectorized version: four floats per iteration
# of the main loop (processed as two pairs of doubles), up to three scalar
# tail elements, and a fully scalar fallback loop.
# Arguments: x in %rdi, y in %rsi, a in %xmm0, n in %edx.
# Label definitions are not present in this listing.
.p2align 4
.globl f2
.type f2, @function
# Nothing to do when n <= 0.
testl %edx, %edx
jle .L17
# Run-time check on the distance between y and x: when (y + 15 - x),
# compared as unsigned, is <= 30 the scalar fallback (.L19) is used.
leaq 15(%rsi), %rax
leal -1(%rdx), %ecx
subq %rdi, %rax
cmpq $30, %rax
jbe .L19
# Also use the fallback with n-1 <= 2 (fewer than four elements).
cmpl $2, %ecx
jbe .L19
# %xmm7 = a copied into all four lanes, %rax = byte offset = 0,
# %rcx = 16*(n/4) = size in bytes of the vectorized part.
movl %edx, %ecx
movaps %xmm0, %xmm7
xorl %eax, %eax
shrl $2, %ecx
shufps $0, %xmm7, %xmm7
salq $4, %rcx
.p2align 4,,10
.p2align 3
# Vector loop. The four float products a*x are split into two halves; each
# half is widened to two doubles (cvtps2pd), square-rooted (sqrtpd) and
# added to the matching pair of y values, also widened to double. Both
# halves are then narrowed back to float (cvtpd2ps), recombined (movlhps)
# and stored.
movups (%rdi,%rax), %xmm2
movlps 8(%rsi,%rax), %xmm6
mulps %xmm7, %xmm2
movhlps %xmm2, %xmm5
cvtps2pd %xmm2, %xmm1
sqrtpd %xmm1, %xmm4
cvtps2pd (%rsi,%rax), %xmm1
cvtps2pd %xmm5, %xmm2
addpd %xmm4, %xmm1
sqrtpd %xmm2, %xmm3
cvtps2pd %xmm6, %xmm2
addpd %xmm3, %xmm2
cvtpd2ps %xmm1, %xmm1
cvtpd2ps %xmm2, %xmm2
movlhps %xmm2, %xmm1
movups %xmm1, (%rsi,%rax)
addq $16, %rax
cmpq %rcx, %rax
jne .L20
# %eax = n rounded down to a multiple of 4; finished when n % 4 == 0.
movl %edx, %eax
andl $-4, %eax
testb $3, %dl
je .L17
# Scalar tail, first element (index %eax): same widen / sqrt / add / narrow
# sequence on a single value.
movl %eax, %r8d
movss (%rdi,%r8,4), %xmm1
leaq (%rsi,%r8,4), %rcx
mulss %xmm0, %xmm1
cvtss2sd %xmm1, %xmm1
movapd %xmm1, %xmm2
pxor %xmm1, %xmm1
sqrtsd %xmm2, %xmm2
cvtss2sd (%rcx), %xmm1
addsd %xmm2, %xmm1
cvtsd2ss %xmm1, %xmm1
movss %xmm1, (%rcx)
# Second tail element, only if index+1 < n.
leal 1(%rax), %ecx
cmpl %ecx, %edx
jle .L17
movslq %ecx, %rcx
addl $2, %eax
movss (%rdi,%rcx,4), %xmm1
leaq (%rsi,%rcx,4), %r8
mulss %xmm0, %xmm1
cvtss2sd %xmm1, %xmm1
movapd %xmm1, %xmm2
pxor %xmm1, %xmm1
sqrtsd %xmm2, %xmm2
cvtss2sd (%r8), %xmm1
addsd %xmm2, %xmm1
cvtsd2ss %xmm1, %xmm1
movss %xmm1, (%r8)
# Third tail element, only if index+2 < n.
cmpl %eax, %edx
jle .L17
mulss (%rdi,%rax,4), %xmm0
leaq (%rsi,%rax,4), %rdx
cvtss2sd %xmm0, %xmm0
sqrtsd %xmm0, %xmm0
movapd %xmm0, %xmm1
pxor %xmm0, %xmm0
cvtss2sd (%rdx), %xmm0
addsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
movss %xmm0, (%rdx)
.p2align 4,,10
.p2align 3
# Presumably the .L19 fallback: %rax = y + 4*n (end pointer; %rcx = n-1).
leaq 4(%rsi,%rcx,4), %rax
.p2align 4,,10
.p2align 3
# Scalar loop: both cursors advance first, so y[i] is addressed as -4(%rsi).
movss (%rdi), %xmm1
addq $4, %rsi
addq $4, %rdi
mulss %xmm0, %xmm1
cvtss2sd %xmm1, %xmm1
movapd %xmm1, %xmm2
pxor %xmm1, %xmm1
sqrtsd %xmm2, %xmm2
cvtss2sd -4(%rsi), %xmm1
addsd %xmm2, %xmm1
cvtsd2ss %xmm1, %xmm1
movss %xmm1, -4(%rsi)
cmpq %rax, %rsi
jne .L22
.size f2, .-f2
.section .text.startup,"ax",@progbits
.p2align 4
.globl main
.type main, @function
xorl %eax, %eax
.size main, .-main
.ident "GCC: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0"
.section .note.GNU-stack,"",@progbits
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
.string "GNU"
.align 8
.long 0xc0000002
.long 3f - 2f
.long 0x3
.align 8

# Cours A4
## 16/01 SIMD
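-O1 : pas d'optimisation poussée (pas de vectorisation)
-O3 : optimisation avec parallélisation (vectorisation SIMD), on transforme en a + b*c
-ffast-math : le compilateur ignore les cas d'erreur dans les données (NaN, infinis, arrondis)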
Si la fonction est de la forme
for (i = 0; i < n; i++)
{ opérations + - / * }
sur des float / double,
on ne touche pas à la fonction : le compilateur sait la vectoriser.
Sinon, on va aller chercher dans les fonctions SIMD.
attention aux shuffles, la partie 1 ne contient que des données de a et la partie 2 que des données de b.
## 19/01 OpenMP
Attention aux clauses : si les temps d'exécution des itérations sont identiques, la demande de travail coûte plus cher que le gain apporté.

