;***************************************************************************
; unit:    raster      release 0.37                                        *
; purpose: general manipulation of n-dimensional matrices, n = 1, 2 and 3. *
;          Use this file or rasterc.c. You cannot link both files together *
; license:     GPL or LGPL                                                 *
; Copyright: (c) 1998-2025 Jaroslav Fojtik                                 *
;***************************************************************************

.CODE             ;Indicates the start of a code segment.


;void Conv4_8_SSE(BYTE *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands packed 4-bit pixels to 8-bit pixels. Each nibble n becomes the
; byte nn; the high nibble of a source byte is emitted first.
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives Size1D bytes
;        RDX = Src,  (Size1D+1)/2 bytes are read
;        R8D = Size1D, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, RDX, XMM0-XMM4, flags (RDI/RSI are saved by USES)
        public  Conv4_8_SSE
Conv4_8_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,16
        jb      PIXEL1                  ; fewer than 16 pixels: scalar path only

        mov     eax,0F0F0F0F0h
        movd    xmm4,eax
        punpcklbw xmm4,xmm4             ; xmm4 = F0h in each of the low 8 bytes
        mov     eax,0F0F0F0Fh
        movd    xmm3,eax
        punpcklbw xmm3,xmm3             ; xmm3 = 0Fh in each of the low 8 bytes

PIXEL16:                                ; invariant: rcx+16 pixels remain, rcx >= 0
        movq    xmm0,qword ptr [rsi]    ; 8 source bytes = 16 pixels
        movq    xmm1,xmm0
        add     rsi,8
        pand    xmm0,xmm3               ; keep low nibbles:  -l
        movq    xmm2,xmm0
        psllw   xmm2,4                  ; l-  (no byte shift exists; word shift is
                                        ;      safe because high nibbles are zero)
        por     xmm0,xmm2               ; ll

        pand    xmm1,xmm4               ; keep high nibbles: h-
        movq    xmm2,xmm1
        psrlw   xmm2,4                  ; -h
        por     xmm1,xmm2               ; hh

        punpcklbw xmm1,xmm0             ; interleave: hh ll hh ll ...
        movups  [rdi],xmm1              ; unaligned store: Dest need not be 16-aligned
        add     rdi,16
        sub     rcx,16
        jae     PIXEL16

PIXEL1: add     rcx,16                  ; rcx = pixels left (0..15)
        jz      toend                   ; nothing left, or the array was empty
        cld
PIXEL:  lodsb                           ; al = hl
        mov     ah,al                   ; ax = hlhl
        mov     dx,ax
        rol     ax,4                    ; ax = lhlh
        and     dx,00FF0h               ; dx = 0lh0
        and     ax,0F00Fh               ; ax = l00h
        or      ax,dx                   ; ax = llhh -> al = hh, ah = ll
        sub     rcx,2
        jb      ToEndStor1              ; only one pixel was left
        stosw                           ; stosw leaves the flags of SUB intact
        jnz     PIXEL

toend:
        ret

ToEndStor1:                             ; odd count: store the last single pixel
        stosb
        ret

Conv4_8_SSE endp


;*************************************************************************************


;void Conv4_16_SSE(WORD *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands packed 4-bit pixels to 16-bit pixels. Each nibble n becomes the
; word nnnn; the high nibble of a source byte is emitted first.
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives Size1D words
;        RDX = Src,  (Size1D+1)/2 bytes are read
;        R8D = Size1D, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, RDX, XMM0-XMM4, flags (RDI/RSI are saved by USES)
        public  Conv4_16_SSE
Conv4_16_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,8
        jb      PIXEL1                  ; fewer than 8 pixels: scalar path only

        mov     eax,0F0F0F0F0h
        movd    xmm4,eax                ; high-nibble mask for the 4 loaded bytes
        mov     eax,0F0F0F0Fh
        movd    xmm3,eax                ; low-nibble mask for the 4 loaded bytes

PIXEL8:                                 ; invariant: rcx+8 pixels remain, rcx >= 0
        movd    xmm0,dword ptr [rsi]    ; 4 source bytes = 8 pixels
        movq    xmm1,xmm0
        add     rsi,4
        pand    xmm0,xmm3               ; -l
        movq    xmm2,xmm0
        psllw   xmm2,4                  ; l-  (word shift; high nibbles are zero)
        por     xmm0,xmm2               ; ll

        pand    xmm1,xmm4               ; h-
        movq    xmm2,xmm1
        psrlw   xmm2,4                  ; -h
        por     xmm1,xmm2               ; hh

        punpcklbw xmm1,xmm0             ; hh ll hh ll ...   (8 bytes)
        punpcklbw xmm1,xmm1             ; hhhh llll ...     (8 words)
        movups  [rdi],xmm1              ; unaligned store: Dest need not be 16-aligned
        add     rdi,16
        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8                   ; rcx = pixels left (0..7)
        jz      toend                   ; nothing left, or the array was empty
        cld
PIXEL:  lodsb                           ; al = hl
        mov     ah,al
        mov     dx,ax
        sal     eax,16
        mov     ax,dx
        mov     edx,eax                 ; eax = edx = hlhlhlhl

        rol     eax,4                   ; eax = lhlhlhlh
        and     edx,00F0FF0F0h          ; edx = 0l0lh0h0
        and     eax,0F0F00F0Fh          ; eax = l0l00h0h
        or      eax,edx                 ; eax = llllhhhh -> low word first pixel
        sub     rcx,2
        jb      ToEndStor1              ; only one pixel was left
        stosd                           ; stosd leaves the flags of SUB intact
        jnz     PIXEL

toend:
        ret

ToEndStor1:                             ; odd count: store the last single pixel
        stosw
        ret

Conv4_16_SSE endp


;*************************************************************************************


;void Conv4_32_SSE(DWORD *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands packed 4-bit pixels to 32-bit pixels. Each nibble n becomes the
; dword nnnnnnnn; the high nibble of a source byte is emitted first.
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives Size1D dwords
;        RDX = Src,  (Size1D+1)/2 bytes are read
;        R8D = Size1D, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, RDX, XMM0-XMM4, flags (RDI/RSI are saved by USES)
        public  Conv4_32_SSE
Conv4_32_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,8
        jb      PIXEL1                  ; fewer than 8 pixels: scalar path only

        mov     eax,0F0F0F0F0h
        movd    xmm4,eax                ; high-nibble mask for the 4 loaded bytes
        mov     eax,0F0F0F0Fh
        movd    xmm3,eax                ; low-nibble mask for the 4 loaded bytes

PIXEL8:                                 ; invariant: rcx+8 pixels remain, rcx >= 0
        movd    xmm0,dword ptr [rsi]    ; 4 source bytes = 8 pixels
        movq    xmm1,xmm0
        add     rsi,4
        pand    xmm0,xmm3               ; -l
        movq    xmm2,xmm0
        psllw   xmm2,4                  ; l-  (word shift; high nibbles are zero)
        por     xmm0,xmm2               ; ll

        pand    xmm1,xmm4               ; h-
        movq    xmm2,xmm1
        psrlw   xmm2,4                  ; -h
        por     xmm1,xmm2               ; hh

        punpcklbw xmm1,xmm0             ; 8 bytes:  hh ll ...
        punpcklbw xmm1,xmm1             ; 8 words:  hhhh llll ...
        movaps  xmm0,xmm1
        punpcklwd xmm0,xmm0             ; pixels 1..4 as dwords
        movups  [rdi],xmm0              ; unaligned stores: Dest need not be 16-aligned
        punpckhwd xmm1,xmm1             ; pixels 5..8 as dwords
        movups  [rdi+16],xmm1

        add     rdi,32
        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8                   ; rcx = pixels left (0..7)
        jz      toend                   ; nothing left, or the array was empty
        cld
PIXEL:  lodsb                           ; al = hl
        mov     ah,al
        mov     dx,ax
        sal     eax,16
        mov     ax,dx
        mov     edx,eax                 ; eax = edx = hlhlhlhl

        rol     eax,4                   ; eax = lhlhlhlh
        and     edx,00F0FF0F0h          ; edx = 0l0lh0h0
        and     eax,0F0F00F0Fh          ; eax = l0l00h0h
        or      eax,edx                 ; eax = llllhhhh
        mov     edx,eax
        rol     eax,16                  ; eax = hhhhllll
        xchg    ax,dx                   ; eax = hhhhhhhh, edx = llllllll
        stosd                           ; first pixel (rcx >= 1 here)

        mov     eax,edx                 ; second pixel
        sub     rcx,2
        jb      toend                   ; count was odd: no second pixel
        stosd                           ; stosd leaves the flags of SUB intact
        jnz     PIXEL

toend:
        ret

Conv4_32_SSE endp


;*************************************************************************************


;void Conv4_64_SSE(QWORD *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands packed 4-bit pixels to 64-bit pixels. Each nibble n becomes a
; qword made of sixteen copies of n; the high nibble of a source byte is
; emitted first.
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives Size1D qwords
;        RDX = Src,  (Size1D+1)/2 bytes are read
;        R8D = Size1D, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, R8, R9, XMM0-XMM4, flags (RDI/RSI are saved by USES)
        public  Conv4_64_SSE
Conv4_64_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,8
        jb      PIXEL1                  ; fewer than 8 pixels: scalar path only

        mov     eax,0F0F0F0F0h
        movd    xmm4,eax                ; high-nibble mask for the 4 loaded bytes
        mov     eax,0F0F0F0Fh
        movd    xmm3,eax                ; low-nibble mask for the 4 loaded bytes

PIXEL8:                                 ; invariant: rcx+8 pixels remain, rcx >= 0
        movd    xmm0,dword ptr [rsi]    ; 4 source bytes = 8 pixels
        movq    xmm1,xmm0
        add     rsi,4
        pand    xmm0,xmm3               ; -l
        movq    xmm2,xmm0
        psllw   xmm2,4                  ; l-  (word shift; high nibbles are zero)
        por     xmm0,xmm2               ; ll

        pand    xmm1,xmm4               ; h-
        movq    xmm2,xmm1
        psrlw   xmm2,4                  ; -h
        por     xmm1,xmm2               ; hh

        punpcklbw xmm1,xmm0             ; 8 bytes
        punpcklbw xmm1,xmm1             ; 8 words: pixels 1..8

        ; Every byte of a pixel is identical, so doubling words repeatedly
        ; widens a pixel correctly regardless of word order inside it.
        movaps  xmm0,xmm1
        punpcklwd xmm0,xmm0             ; pixels 1..4 as dwords
        movaps  xmm2,xmm0
        punpcklwd xmm0,xmm0             ; pixels 1,2 as qwords
        movups  [rdi],xmm0              ; unaligned stores: Dest need not be 16-aligned
        punpckhwd xmm2,xmm2             ; pixels 3,4 as qwords
        movups  [rdi+16],xmm2

        punpckhwd xmm1,xmm1             ; pixels 5..8 as dwords
        movaps  xmm2,xmm1
        punpcklwd xmm1,xmm1             ; pixels 5,6 as qwords
        movups  [rdi+32],xmm1
        punpckhwd xmm2,xmm2             ; pixels 7,8 as qwords
        movups  [rdi+48],xmm2

        add     rdi,64
        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8                   ; rcx = pixels left (0..7)
        jz      toend                   ; nothing left, or the array was empty
        cld
        mov     r9,1111111111111111h    ; n * r9 = sixteen copies of nibble n
PIXEL:  movzx   eax,byte ptr [rsi]      ; rax = hl
        inc     rsi
        mov     r8d,eax                 ; keep the byte for its low nibble
        shr     eax,4                   ; rax = h
        imul    rax,r9
        stosq
        dec     rcx
        jz      toend                   ; count was odd: low nibble is unused
        mov     eax,r8d
        and     eax,0Fh                 ; rax = l
        imul    rax,r9
        stosq
        dec     rcx
        jnz     PIXEL

toend:
        ret

Conv4_64_SSE endp


;*************************************************************************************


        public  Conv8_4_SSE
; Conv8_4_SSE - packs 8-bit pixels into 4-bit pixels by keeping the high
; nibble of every source byte. The first pixel of each pair lands in the
; high nibble of the output byte. With an odd count the last output byte
; has its low nibble cleared.
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv8_4_SSE(BYTE *Dest, const BYTE *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives (count+1)/2 bytes
;        RDX = Src,  count bytes are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, XMM0, XMM1, XMM3, flags (RDI/RSI are saved by USES)
Conv8_4_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,16
        jb      PIXEL1                  ; fewer than 16 pixels: scalar path only

        mov     eax,00F000F0h
        movd    xmm3,eax
        punpckldq xmm3,xmm3
        punpckldq xmm3,xmm3             ; xmm3 = 00F0h in all 8 words

PIXEL16:                                ; invariant: rcx+16 pixels remain, rcx >= 0
        movups  xmm0,[rsi]              ; 16 pixels; Src need not be 16-aligned
        movaps  xmm1,xmm0
        add     rsi,16
        pand    xmm0,xmm3               ; word = 00 e0  (even pixel, high nibble)
        psrlw   xmm1,12                 ; word = 00 0o  (odd pixel, high nibble)
        por     xmm0,xmm1               ; word = 00 eo
        packuswb xmm0,xmm0              ; all words <= FFh, so no saturation occurs
        movq    qword ptr [rdi],xmm0    ; 8 packed bytes
        add     rdi,8
        sub     rcx,16
        jae     PIXEL16

PIXEL1: add     rcx,16                  ; rcx = pixels left (0..15)
        jz      toend
        cld
PIXEL:  lodsb                           ; first pixel of the pair
        and     al,0F0h
        dec     rcx
        jz      LASTNIBBLE              ; no partner pixel
        mov     ah,al
        lodsb                           ; second pixel of the pair
        shr     al,4                    ; its high nibble moves to the low nibble
        or      al,ah
        stosb                           ; store both nibbles
        dec     rcx
        jnz     PIXEL

toend:
        ret

LASTNIBBLE:                             ; odd count: low nibble stays zero
        stosb
        ret

Conv8_4_SSE endp



;*************************************************************************************


        public  Conv8_16_SSE
; Conv8_16_SSE - widens 8-bit pixels to 16-bit pixels by duplicating the
; byte (b -> bb).
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv8_16_SSE(WORD *Dest, const BYTE *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count words
;        RDX = Src,  count bytes are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, XMM0, XMM1, flags (RDI/RSI are saved by USES)
Conv8_16_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      ToEnd                   ; NULL Dest
        test    rdx,rdx
        jz      ToEnd                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,16
        jb      PIXEL1                  ; fewer than 16 pixels: scalar path only

PIXEL16:                                ; invariant: rcx+16 pixels remain, rcx >= 0
        movups  xmm0,[rsi]              ; 16 pixels; Src need not be 16-aligned
        movaps  xmm1,xmm0
        add     rsi,16
        punpcklbw xmm0,xmm0             ; pixels 1..8 as words
        movups  [rdi],xmm0              ; Dest need not be 16-aligned either
        punpckhbw xmm1,xmm1             ; pixels 9..16 as words
        movups  [rdi+16],xmm1

        add     rdi,32
        sub     rcx,16
        jae     PIXEL16

PIXEL1: add     rcx,16                  ; rcx = pixels left (0..15)
        jz      ToEnd
        cld
PIXEL:  lodsb
        mov     ah,al                   ; ax = bb
        stosw
        dec     rcx
        jnz     PIXEL

ToEnd:  ret

Conv8_16_SSE endp


;*************************************************************************************

        public  Conv8_32_SSE
; Conv8_32_SSE - widens 8-bit pixels to 32-bit pixels by replicating the
; byte four times (b -> bbbb).
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv8_32_SSE(DWORD *Dest, const BYTE *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count dwords
;        RDX = Src,  count bytes are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, XMM0, XMM1, flags (RDI/RSI are saved by USES)
Conv8_32_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,8
        jb      PIXEL1                  ; fewer than 8 pixels: scalar path only

PIXEL8:                                 ; invariant: rcx+8 pixels remain, rcx >= 0
        movq    xmm0,qword ptr [rsi]    ; 8 pixels
        add     rsi,8
        punpcklbw xmm0,xmm0             ; 8 words
        movaps  xmm1,xmm0
        punpcklwd xmm0,xmm0             ; pixels 1..4 as dwords
        movups  [rdi],xmm0              ; unaligned stores: Dest need not be 16-aligned
        punpckhwd xmm1,xmm1             ; pixels 5..8 as dwords
        movups  [rdi+16],xmm1
        add     rdi,32
        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8                   ; rcx = pixels left (0..7)
        jz      toend
        cld
PIXEL:  movzx   eax,byte ptr [rsi]      ; eax = 000000bb
        inc     rsi
        imul    eax,eax,01010101h       ; eax = bbbbbbbb (no overflow: FFh -> FFFFFFFFh)
        stosd
        dec     rcx
        jnz     PIXEL

toend:
        ret

Conv8_32_SSE endp



;*************************************************************************************

        public  Conv8_64_SSE
; Conv8_64_SSE - widens 8-bit pixels to 64-bit pixels by replicating the
; byte eight times.
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv8_64_SSE(QWORD *Dest, const BYTE *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count qwords
;        RDX = Src,  count bytes are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, R8, XMM0, XMM1, flags (RDI/RSI are saved by USES)
Conv8_64_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,4
        jb      PIXEL1                  ; fewer than 4 pixels: scalar path only

PIXEL4:                                 ; invariant: rcx+4 pixels remain, rcx >= 0
        movd    xmm0,dword ptr [rsi]    ; 4 pixels
        add     rsi,4
        punpcklbw xmm0,xmm0             ; 4 words
        punpcklwd xmm0,xmm0             ; 4 dwords
        movaps  xmm1,xmm0
        punpckldq xmm0,xmm0             ; pixels 1,2 as qwords
        movups  [rdi],xmm0              ; unaligned stores: Dest need not be 16-aligned
        punpckhdq xmm1,xmm1             ; pixels 3,4 as qwords
        movups  [rdi+16],xmm1

        add     rdi,32
        sub     rcx,4
        jae     PIXEL4

PIXEL1: add     rcx,4                   ; rcx = pixels left (0..3)
        jz      toend
        cld
        mov     r8,0101010101010101h    ; b * r8 = eight copies of byte b
PIXEL:  movzx   eax,byte ptr [rsi]
        inc     rsi
        imul    rax,r8
        stosq
        dec     rcx
        jnz     PIXEL

toend:
        ret

Conv8_64_SSE endp


;*************************************************************************************

        public  Conv16_64_SSE
; Conv16_64_SSE - widens 16-bit pixels to 64-bit pixels by replicating the
; word four times.
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv16_64_SSE(QWORD *Dest, const WORD *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count qwords
;        RDX = Src,  count words are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, R8, XMM0, flags (RDI/RSI are saved by USES)
Conv16_64_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,2
        jb      PIXEL1                  ; fewer than 2 pixels: scalar path only

PIXEL2:                                 ; invariant: rcx+2 pixels remain, rcx >= 0
        movd    xmm0,dword ptr [rsi]    ; 2 pixels
        add     rsi,4
        punpcklwd xmm0,xmm0             ; 2 dwords
        punpckldq xmm0,xmm0             ; 2 qwords
        movups  [rdi],xmm0              ; unaligned store: Dest need not be 16-aligned
        add     rdi,16
        sub     rcx,2
        jae     PIXEL2

PIXEL1: add     rcx,2                   ; rcx = pixels left (0 or 1)
        jz      toend
        cld
        mov     r8,0001000100010001h    ; w * r8 = four copies of word w
PIXEL:  movzx   eax,word ptr [rsi]
        add     rsi,2
        imul    rax,r8
        stosq
        dec     rcx
        jnz     PIXEL

toend:
        ret

Conv16_64_SSE endp


;*************************************************************************************


        public  Conv16_4_SSE
; Conv16_4_SSE - packs 16-bit pixels into 4-bit pixels by keeping the top
; nibble of every source word. The first pixel of each pair lands in the
; high nibble of the output byte. With an odd count the last output byte
; has its low nibble cleared.
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv16_4_SSE(BYTE *Dest, const WORD *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives (count+1)/2 bytes
;        RDX = Src,  count words are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, XMM0, XMM1, XMM3, flags (RDI/RSI are saved by USES)
Conv16_4_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,8
        jb      PIXEL1                  ; fewer than 8 pixels: scalar path only

        mov     eax,00F000F0h
        movd    xmm3,eax
        punpckldq xmm3,xmm3             ; xmm3 = 00F0h in the low 4 words

PIXEL8:                                 ; invariant: rcx+8 pixels remain, rcx >= 0
        movups  xmm0,[rsi]              ; 8 pixels; Src need not be 16-aligned
        add     rsi,16
        psrlw   xmm0,8                  ; keep the high byte of each pixel
        packuswb xmm0,xmm0              ; 8 bytes (values <= FFh, no saturation)
        movq    xmm1,xmm0
        pand    xmm0,xmm3               ; word = 00 e0  (even pixel, high nibble)
        psrlw   xmm1,12                 ; word = 00 0o  (odd pixel, high nibble)
        por     xmm0,xmm1               ; word = 00 eo
        packuswb xmm0,xmm0              ; 4 packed bytes
        movd    dword ptr [rdi],xmm0
        add     rdi,4
        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8                   ; rcx = pixels left (0..7)
        jz      toend
        cld
PIXEL:  mov     al,byte ptr [rsi+1]     ; high byte of the first pixel
        and     al,0F0h
        dec     rcx
        jz      LASTNIBBLE              ; no partner pixel
        mov     ah,byte ptr [rsi+3]     ; high byte of the second pixel
        add     rsi,4
        shr     ah,4                    ; its high nibble moves to the low nibble
        or      al,ah
        stosb                           ; store both nibbles
        dec     rcx
        jnz     PIXEL

toend:
        ret

LASTNIBBLE:                             ; odd count: low nibble stays zero
        stosb
        ret

Conv16_4_SSE endp


;*************************************************************************************


        public  Conv16_8_SSE
; Conv16_8_SSE - narrows 16-bit pixels to 8-bit pixels by keeping the high
; byte of every source word.
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv16_8_SSE(BYTE *Dest, const WORD *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count bytes
;        RDX = Src,  count words are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, XMM0, XMM1, flags
;
; RDI and RSI are callee-saved in this ABI and are overwritten below, so
; they must be listed in USES like in the sibling routines.
Conv16_8_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      ToEnd                   ; NULL Dest
        test    rdx,rdx
        jz      ToEnd                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,16
        jb      PIXEL1                  ; fewer than 16 pixels: scalar path only

PIXEL16:                                ; invariant: rcx+16 pixels remain, rcx >= 0
        movups  xmm0,[rsi]              ; pixels 1..8;  Src need not be 16-aligned
        movups  xmm1,[rsi+16]           ; pixels 9..16
        add     rsi,32
        psrlw   xmm0,8                  ; keep the high byte of each pixel
        psrlw   xmm1,8
        packuswb xmm0,xmm1              ; 16 bytes (values <= FFh, no saturation)
        movups  [rdi],xmm0              ; Dest need not be 16-aligned either

        add     rdi,16
        sub     rcx,16
        jae     PIXEL16

PIXEL1: add     rcx,16                  ; rcx = pixels left (0..15)
        jz      ToEnd
        cld
PIXEL:  lodsw
        mov     al,ah                   ; keep the high byte
        stosb
        dec     rcx
        jnz     PIXEL
ToEnd:
        ret

Conv16_8_SSE endp


;*************************************************************************************

        public  Conv16_32_SSE
; Conv16_32_SSE - widens 16-bit pixels to 32-bit pixels by duplicating the
; word (w -> ww).
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv16_32_SSE(DWORD *Dest, const WORD *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count dwords
;        RDX = Src,  count words are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RAX, RCX, XMM0, XMM1, flags (RDI/RSI are saved by USES)
Conv16_32_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,8
        jb      PIXEL1                  ; fewer than 8 pixels: scalar path only

PIXEL8:                                 ; invariant: rcx+8 pixels remain, rcx >= 0
        movups  xmm0,[rsi]              ; 8 pixels; Src need not be 16-aligned
        movaps  xmm1,xmm0
        add     rsi,16
        punpcklwd xmm0,xmm0             ; pixels 1..4 as dwords
        movups  [rdi],xmm0              ; Dest need not be 16-aligned either
        punpckhwd xmm1,xmm1             ; pixels 5..8 as dwords
        movups  [rdi+16],xmm1
        add     rdi,32
        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8                   ; rcx = pixels left (0..7)
        jz      toend
        cld
PIXEL:  movzx   eax,word ptr [rsi]      ; eax = 0000wwww
        add     rsi,2
        imul    eax,eax,00010001h       ; eax = wwwwwwww (no overflow: FFFFh -> FFFFFFFFh)
        stosd
        dec     rcx
        jnz     PIXEL

toend:
        ret

Conv16_32_SSE endp


;*************************************************************************************


        public  Conv32_16_SSE
; Conv32_16_SSE - narrows 32-bit pixels to 16-bit pixels by keeping the
; high word of every source dword.
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv32_16_SSE(WORD *Dest, const DWORD *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count words
;        RDX = Src,  count dwords are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RCX, XMM0, flags (RDI/RSI are saved by USES)
Conv32_16_SSE proc uses rdi rsi

        test    rcx,rcx
        jz      toend                   ; NULL Dest
        test    rdx,rdx
        jz      toend                   ; NULL Src
        mov     rdi,rcx                 ; rdi = Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,4
        jb      PIXEL1                  ; fewer than 4 pixels: scalar path only

PIXEL4:                                 ; invariant: rcx+4 pixels remain, rcx >= 0
        movups  xmm0,[rsi]              ; 4 pixels; Src need not be 16-aligned
        add     rsi,16
        ; PACKSSDW saturates to the signed range -8000h..7FFFh. A logical
        ; shift would leave 0000wwww, so every w >= 8000h would be clamped
        ; to 7FFFh. The arithmetic shift sign-extends the high word, which
        ; makes the pack reproduce its bit pattern exactly.
        psrad   xmm0,16
        packssdw xmm0,xmm0
        movq    qword ptr [rdi],xmm0    ; 4 words
        add     rdi,8
        sub     rcx,4
        jae     PIXEL4

PIXEL1: add     rcx,4                   ; rcx = pixels left (0..3)
        jz      toend
        cld
PIXEL:  add     rsi,2                   ; skip the low word
        movsw                           ; copy the high word, advance rsi and rdi
        dec     rcx
        jnz     PIXEL

toend:
        ret

Conv32_16_SSE endp


;*************************************************************************************


        public  Conv32_64_SSE
; Conv32_64_SSE - widens 32-bit pixels to 64-bit pixels by duplicating the
; dword (d -> dd).
; Presumed C prototype (inferred from register use and the sibling
; routines - confirm against the header):
;   void Conv32_64_SSE(QWORD *Dest, const DWORD *Src, unsigned Size1D)
;
; ABI:   Microsoft x64
; In:    RCX = Dest, receives count qwords
;        RDX = Src,  count dwords are read
;        R8D = count, pixel count. Only the low 32 bits are used, because
;              the upper half of R8 is undefined for a 32-bit argument.
; Out:   nothing; returns immediately when Dest or Src is NULL
; Clobb: RCX, RDX, XMM0, flags (RSI is saved by USES; RDI is not touched)
Conv32_64_SSE proc uses rsi

        test    rdx,rdx
        jz      toend                   ; NULL Src
        test    rcx,rcx
        jz      toend                   ; NULL Dest
        mov     rsi,rdx                 ; rsi = Src
        mov     rdx,rcx                 ; rdx = Dest
        mov     ecx,r8d                 ; rcx = pixel count, zero-extended

        sub     rcx,2
        jb      PIXEL1                  ; fewer than 2 pixels: single-pixel path only

PIXEL2:                                 ; invariant: rcx+2 pixels remain, rcx >= 0
        movq    xmm0,qword ptr [rsi]    ; 2 pixels
        add     rsi,8
        punpckldq xmm0,xmm0             ; 2 qwords
        movups  [rdx],xmm0              ; unaligned store: Dest need not be 16-aligned
        add     rdx,16
        sub     rcx,2
        jae     PIXEL2

PIXEL1: add     rcx,2                   ; rcx = pixels left (0 or 1)
        jz      toend
PIXEL:  movd    xmm0,dword ptr [rsi]
        add     rsi,4
        punpckldq xmm0,xmm0
        movq    qword ptr [rdx],xmm0
        add     rdx,8
        dec     rcx
        jnz     PIXEL

toend:
        ret

Conv32_64_SSE endp



        end
