;***************************************************************************
; unit:    raster      release 0.37                                        *
; purpose: general manipulation of n-dimensional matrices, n = 1, 2 and 3. *
;          Use this file or rasterc.c. You cannot link both files together *
; license:     GPL or LGPL                                                 *
; Copyright: (c) 1998-2025 Jaroslav Fojtik                                 *
;***************************************************************************

.CODE             ;Indicates the start of a code segment.


;void Conv4_8_MMX(BYTE *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands 4-bit pixels to 8-bit pixels: every nibble N becomes the byte NNh.
; The high nibble of a source byte is emitted first, the low nibble second.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = Size1D (pixel count)
; Clobbers:		rax, rcx, rdx, mm0-mm4, flags; rdi/rsi are saved by USES
        public  Conv4_8_MMX
Conv4_8_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	test	rcx,rcx
	jz	toend			; NULL dst pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for an unsigned arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,8
	jb	PIXEL1			; fewer than 8 pixels: scalar tail only

	mov	eax,0F0F0F0F0h
	movd	mm4,eax			; mm4 = high nibble mask
	mov	eax, 0F0F0F0Fh
	movd	mm3,eax			; mm3 = low nibble mask
PIXEL8:	movd	mm0,dword ptr[rsi]	; 87654321
	movq	mm1,mm0
	add	rsi,4
	pand	mm0,mm3			; -7-5-3-1
	movq	mm2,mm0
	psllw	mm2,4			; 7-5-3-1-	no byte shift exists; a word shift is safe
					;		because the neighbouring nibbles are zero
	por	mm0,mm2			; 77553311

	pand	mm1,mm4			; 8-6-4-2-
	movq	mm2,mm1
	psrlw	mm2,4			; -8-6-4-2
	por	mm1,mm2			; 88664422

	punpcklbw mm1,mm0		; interleave: high-nibble pixel first
	movq	qword ptr[rdi],mm1	; 8877665544332211
	add	rdi,8
	sub	rcx,8
	jae	PIXEL8			; at least 8 more pixels remain
	emms

PIXEL1:	add	rcx,8			; rcx = remaining pixels (0..7)
	jz	toend			; array has zero size or all done
	cld
PIXEL:	lodsb				; al = HL (two pixels)
	mov	ah,al
	mov	dx,ax			; HL HL
	rol	ax,4			; LH LH
	and	dx,00FF0h		; 0L H0
	and	ax,0F00Fh		; L0 0H
	or	ax,dx			; LL HH -> al = first pixel, ah = second pixel
	sub	rcx,2
	jb	ToEndStor1		; only one pixel was left
	stosw				; stosw keeps the flags from the sub above
	jnz	PIXEL

toend:
	ret

ToEndStor1:				; one remaining byte needs to be stored
	stosb
	ret

Conv4_8_MMX endp


;*************************************************************************************


;void Conv4_16_MMX(WORD *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands 4-bit pixels to 16-bit pixels: every nibble N becomes the word NNNNh.
; The high nibble of a source byte is emitted first, the low nibble second.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = Size1D (pixel count)
; Clobbers:		rax, rcx, rdx, mm0-mm4, flags; rdi/rsi are saved by USES
        public  Conv4_16_MMX
Conv4_16_MMX proc \
	uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	test	rcx,rcx
	jz	toend			; NULL dst pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for an unsigned arg
	mov	rsi,rdx			; rsi = source pointer
	test	rdx,rdx
	jz	toend			; NULL src pointer

	sub	rcx,8
	jb	PIXEL1			; fewer than 8 pixels: scalar tail only

	mov	eax,0F0F0F0F0h
	movd	mm4,eax			; mm4 = high nibble mask
	mov	eax, 0F0F0F0Fh
	movd	mm3,eax			; mm3 = low nibble mask
PIXEL8:	movd	mm0,dword ptr[rsi]	; 87654321
	movq	mm1,mm0
	add	rsi,4
	pand	mm0,mm3			; -7-5-3-1
	movq	mm2,mm0
	psllw	mm2,4			; 7-5-3-1-	no byte shift exists; a word shift is safe
	por	mm0,mm2			; 77553311

	pand	mm1,mm4			; 8-6-4-2-
	movq	mm2,mm1
	psrlw	mm2,4			; -8-6-4-2
	por	mm1,mm2			; 88664422

	punpcklbw mm1,mm0		; 8877665544332211
	movq	mm0,mm1
	punpcklbw mm0,mm0		; double the bytes of the lower four pixels
	movq	qword ptr[rdi],mm0	; 4444333322221111
	punpckhbw mm1,mm1		; double the bytes of the upper four pixels
	movq	qword ptr[rdi+8],mm1	; 8888777766665555
	add	rdi,16
	sub	rcx,8
	jae	PIXEL8			; at least 8 more pixels remain
	emms

PIXEL1:	add	rcx,8			; rcx = remaining pixels (0..7)
	jz	toend			; array has zero size or all done
	cld
PIXEL:	lodsb				; al = HL (two pixels)
	mov	ah,al
	mov	dx,ax
	sal	eax,16
	mov	ax,dx
	mov	edx,eax			; HL HL HL HL

	rol	eax,4			; LH LH LH LH
	and	edx,00F0FF0F0h		; 0L 0L H0 H0
	and	eax,0F0F00F0Fh		; L0 L0 0H 0H
	or	eax,edx			; LLLL HHHH -> low word = first pixel
	sub	rcx,2
	jb	ToEndStor1		; only 1 pixel is remaining
	stosd				; stosd keeps the flags from the sub above
	jnz	PIXEL

toend:
	ret

ToEndStor1:				; store the last single pixel
	stosw
	ret

Conv4_16_MMX endp


;*************************************************************************************


;void Conv4_32_MMX(DWORD *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands 4-bit pixels to 32-bit pixels: every nibble N becomes NNNNNNNNh.
; The high nibble of a source byte is emitted first, the low nibble second.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = Size1D (pixel count)
; Clobbers:		rax, rcx, rdx, mm0-mm4, flags; rdi/rsi are saved by USES
        public  Conv4_32_MMX
Conv4_32_MMX proc \
	uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	test	rcx,rcx
	jz	toend			; NULL dst pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for an unsigned arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,8
	jb	PIXEL1			; fewer than 8 pixels: scalar tail only

	mov	eax,0F0F0F0F0h
	movd	mm4,eax			; mm4 = high nibble mask
	mov	eax, 0F0F0F0Fh
	movd	mm3,eax			; mm3 = low nibble mask
PIXEL8:	movd	mm0,dword ptr[rsi]	; 87654321
	movq	mm1,mm0
	add	rsi,4
	pand	mm0,mm3			; -7-5-3-1
	movq	mm2,mm0
	psllw	mm2,4			; 7-5-3-1-	no byte shift exists; a word shift is safe
	por	mm0,mm2			; 77553311

	pand	mm1,mm4			; 8-6-4-2-
	movq	mm2,mm1
	psrlw	mm2,4			; -8-6-4-2
	por	mm1,mm2			; 88664422

	punpcklbw mm1,mm0		; 8877665544332211
	movq	mm0,mm1
	punpcklbw mm0,mm0		; 4444333322221111
	movq	mm2,mm0
	punpcklwd mm2,mm2
	movq	qword ptr[rdi],mm2	; 2222222211111111
	punpckhwd mm0,mm0
	movq	qword ptr[rdi+8],mm0	; 4444444433333333
	punpckhbw mm1,mm1		; 8888777766665555
	movq	mm2,mm1
	punpcklwd mm2,mm2
	movq	qword ptr[rdi+16],mm2	; 6666666655555555
	punpckhwd mm1,mm1
	movq	qword ptr[rdi+24],mm1	; 8888888877777777
	add	rdi,32
	sub	rcx,8
	jae	PIXEL8			; at least 8 more pixels remain
	emms

PIXEL1:	add	rcx,8			; rcx = remaining pixels (0..7)
	jz	toend			; array has zero size or all done
	cld
PIXEL:	lodsb				; al = HL (two pixels)
	mov	ah,al
	mov	dx,ax
	sal	eax,16
	mov	ax,dx
	mov	edx,eax			; HL HL HL HL

	rol	eax,4			; LH LH LH LH
	and	edx,00F0FF0F0h		; 0L 0L H0 H0
	and	eax,0F0F00F0Fh		; L0 L0 0H 0H
	or	eax,edx			; LLLL HHHH
	mov	edx,eax
	rol	eax,16			; HHHH LLLL
	xchg	ax,dx			; eax = HHHHHHHH, edx = LLLLLLLL
	stosd				; 1st pixel is always stored

	mov	eax,edx			; 2nd pixel
	sub	rcx,2
	jb	toend			; the count was odd: no 2nd pixel
	stosd				; stosd preserves ZF from the sub above
	jnz	PIXEL

toend:
	ret

Conv4_32_MMX endp


;*************************************************************************************


;void Conv4_64_MMX(QWORD *Dest, const BYTE *Src, unsigned Size1D)
;
; Expands 4-bit pixels to 64-bit pixels: every nibble N is replicated into all
; sixteen nibbles of the output qword. The high nibble of a source byte is
; emitted first, the low nibble second.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = Size1D (pixel count)
; Clobbers:		rax, rcx, rdx, r8, r9, mm0-mm5, flags; rdi/rsi are saved by USES
        public  Conv4_64_MMX
Conv4_64_MMX proc \
	uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for an unsigned arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,8
	jb	PIXEL1			; fewer than 8 pixels: scalar tail only

	mov	eax,0F0F0F0F0h
	movd	mm4,eax			; mm4 = high nibble mask
	mov	eax, 0F0F0F0Fh
	movd	mm3,eax			; mm3 = low nibble mask
PIXEL8:	movd	mm0,dword ptr[rsi]	; 87654321
	movq	mm1,mm0
	add	rsi,4
	pand	mm0,mm3			; -7-5-3-1
	movq	mm2,mm0
	psllw	mm2,4			; 7-5-3-1-	no byte shift exists; a word shift is safe
	por	mm0,mm2			; 77553311

	pand	mm1,mm4			; 8-6-4-2-
	movq	mm2,mm1
	psrlw	mm2,4			; -8-6-4-2
	por	mm1,mm2			; 88664422

	punpcklbw mm1,mm0		; 8877665544332211
	movq	mm0,mm1
	punpcklbw mm0,mm0		; 4444333322221111
	movq	mm2,mm0
	punpcklwd mm2,mm2		; 2222222211111111
	movq	mm5,mm2
	punpckldq mm5,mm5
	movq	qword ptr[rdi],mm5	; 1111111111111111
	punpckhdq mm2,mm2
	movq	qword ptr[rdi+8],mm2	; 2222222222222222
	punpckhwd mm0,mm0		; 4444444433333333
	movq	mm5,mm0
	punpckldq mm5,mm5
	movq	qword ptr[rdi+16],mm5	; 3333333333333333
	punpckhdq mm0,mm0
	movq	qword ptr[rdi+24],mm0	; 4444444444444444
	punpckhbw mm1,mm1		; 8888777766665555
	movq	mm2,mm1
	punpcklwd mm2,mm2		; 6666666655555555
	movq	mm5,mm2
	punpckldq mm5,mm5
	movq	qword ptr[rdi+32],mm5	; 5555555555555555
	punpckhdq mm2,mm2
	movq	qword ptr[rdi+40],mm2	; 6666666666666666
	punpckhwd mm1,mm1		; 8888888877777777
	movq	mm5,mm1
	punpckldq mm5,mm5
	movq	qword ptr[rdi+48],mm5	; 7777777777777777
	punpckhdq mm1,mm1
	movq	qword ptr[rdi+56],mm1	; 8888888888888888
	add	rdi,64
	sub	rcx,8
	jae	PIXEL8			; at least 8 more pixels remain
	emms

PIXEL1:	add	rcx,8			; rcx = remaining pixels (0..7)
	jz	toend			; array has zero size or all done
	cld
	mov	R9, 1111111111111111h	; N * R9 copies nibble N into all 16 nibbles
PIXEL:	movzx	rax,byte ptr [rsi]	; rax = HL (two pixels)
	mov	R8,rax			; keep the byte for the 2nd pixel
	inc	rsi
	shr	rax,4			; rax = H
	mul	R9			; rdx is cleared (product fits in 64 bits)
	stosq
	dec	rcx
	jz	toend			; the count was odd: no 2nd pixel
	mov	rax,R8
	and	al,0Fh			; rax = L
	mul	R9
	stosq
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv4_64_MMX endp


;*************************************************************************************


        public  Conv8_4_MMX
; Packs 8-bit pixels into 4-bit pixels by keeping the high nibble of every
; source byte. The first pixel of a pair lands in the high nibble of the
; output byte; an odd trailing pixel is stored with a zero low nibble.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, mm0, mm1, mm3, flags; rdi/rsi are saved by USES
Conv8_4_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	test	rcx,rcx
	jz	toend			; NULL dst pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,8
	jb	PIXEL1			; fewer than 8 pixels: scalar tail only

	mov	eax,00F000F0h
	movd	mm3,eax
	punpckldq mm3,mm3		; mm3 = 00F0h in every word
PIXEL8: movq	mm0,qword ptr [rsi]	; 3h 3l|2h 2l|1h 1l|0h 0l
	movq	mm1,mm0
	add	rsi,8
	pand	mm0,mm3			;  -  -  2h -| -  -  0h -
	psrlw	mm1,12			;  -  -  - 3h| -  -  - 1h
	por	mm0,mm1			;  -  -  2h3h| -  -  0h1h
	packuswb mm0,mm0		; words are <= 0FFh, so no saturation occurs
	movd	dword ptr [rdi],mm0
	add	rdi,4
	sub	rcx,8
	jae	PIXEL8			; at least 8 more pixels remain
	emms

PIXEL1:	add	rcx,8			; rcx = remaining pixels (0..7)
	jz	toend
	cld
PIXEL:	lodsb				; load 1st byte
	and	al,0F0h

	dec	rcx
	jnz	NIBBLE2
	stosb				; store incomplete nibble
	jmp	toend

NIBBLE2:mov	ah,al
	lodsb				; load 2nd byte
	and	al,0F0h
	ror	al,4			; move the nibble to the low half
	or	al,ah
	stosb				; store both nibbles
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv8_4_MMX endp



;*************************************************************************************


        public  Conv8_16_MMX
; Expands 8-bit pixels to 16-bit pixels by duplicating each byte (B -> BBh).
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, mm0, flags; rdi/rsi are saved by USES
Conv8_16_MMX proc \
	uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,4
	jb	PIXEL1			; fewer than 4 pixels: scalar tail only

PIXEL4:	movd	mm0,dword ptr[rsi]	; pixels 1,2,3,4
	add	rsi,4
	punpcklbw mm0,mm0		; every byte doubled into a word
	movq	qword ptr [rdi],mm0

	add	rdi,8
	sub	rcx,4
	jae	PIXEL4			; at least 4 more pixels remain
	emms

PIXEL1: add	rcx,4			; rcx = remaining pixels (0..3)
	jz	toend			; array has zero size or all done
	cld
PIXEL:	lodsb
	mov	ah,al			; ax = BBh
	stosw
	dec	rcx
	jnz	PIXEL

toend:	ret

Conv8_16_MMX endp


;*************************************************************************************

        public  Conv8_32_MMX
; Expands 8-bit pixels to 32-bit pixels by replicating each byte four times.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, rdx, mm0, mm1, flags; rdi/rsi are saved by USES
Conv8_32_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,4
	jb	PIXEL1			; fewer than 4 pixels: scalar tail only

PIXEL4: movd	mm0,dword ptr[rsi]	; pixels 1,2,3,4
	add	rsi,4
	punpcklbw mm0,mm0		; 44 33 22 11
	movq	mm1,mm0
	punpcklwd mm0,mm0		; 2222 1111
	movq	qword ptr [rdi],mm0
	punpckhdq mm1,mm1		; 4 3 4 3
	punpcklwd mm1,mm1		; 4444 3333
	movq	qword ptr [rdi+8],mm1
	add	rdi,16
	sub	rcx,4
	jae	PIXEL4			; at least 4 more pixels remain
	emms

PIXEL1: add	rcx,4			; rcx = remaining pixels (0..3)
	jz	toend			; array has zero size or all done
	cld
PIXEL:	lodsb
	mov	ah,al
	mov	dx,ax			; dx = BBh
	rol	eax,16			; move BBh to the upper word
	mov	ax,dx			; eax = BBBBh
	stosd
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv8_32_MMX endp



;*************************************************************************************

        public  Conv8_64_MMX
; Expands 8-bit pixels to 64-bit pixels by replicating each byte eight times.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, rdx, r8, mm0, mm1, mm3, flags; rdi/rsi are saved by USES
Conv8_64_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,4
	jb	PIXEL1			; fewer than 4 pixels: scalar tail only

PIXEL4: movd	mm0,dword ptr[rsi]	; pixels 1,2,3,4
	add	rsi,4
	punpcklbw mm0,mm0		; 44 33 22 11
	movq	mm1,mm0
	punpcklwd mm0,mm0		; 2222 1111
	movq	mm3,mm0
	punpckldq  mm0,mm3		; 1111 1111
	movq	qword ptr [rdi],mm0
	movq	mm0,mm3
	punpckhdq  mm0,mm3		; 2222 2222
	movq	qword ptr [rdi+8],mm0

	punpckhdq mm1,mm1		; 4 3 4 3
	punpcklwd mm1,mm1		; 4444 3333
	movq	mm3,mm1
	punpckldq  mm1,mm3		; 3333 3333
	movq	qword ptr [rdi+16],mm1
	movq	mm0,mm3
	punpckhdq  mm0,mm3		; 4444 4444
	movq	qword ptr [rdi+24],mm0

	add	rdi,32
	sub	rcx,4
	jae	PIXEL4			; at least 4 more pixels remain
	emms

PIXEL1: add	rcx,4			; rcx = remaining pixels (0..3)
	jz	toend			; array has zero size or all done
	cld
	mov	R8, 101010101010101h	; B * R8 copies byte B into all 8 bytes
PIXEL:	xor	rax,rax
	lodsb
	mul	R8			; RDX is cleared (product fits in 64 bits)
	stosq
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv8_64_MMX endp


;*************************************************************************************

        public  Conv16_64_MMX
; Expands 16-bit pixels to 64-bit pixels by replicating each word four times.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, rdx, r8, mm0, mm1, flags; rdi/rsi are saved by USES
Conv16_64_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,2
	jb	PIXEL1			; fewer than 2 pixels: scalar tail only

PIXEL2: movd	mm0,dword ptr[rsi]	; pixels 1,2
	add	rsi,4
	punpcklwd mm0,mm0		; 2 2 1 1
	movq	mm1,mm0
	punpckldq mm0,mm0		; 1 1 1 1
	movq	qword ptr [rdi],mm0
	punpckhdq  mm1,mm1		; 2 2 2 2
	movq	qword ptr [rdi+8],mm1

	add	rdi,16
	sub	rcx,2
	jae	PIXEL2			; at least 2 more pixels remain
	emms

PIXEL1: add	rcx,2			; rcx = remaining pixels (0..1)
	jz	toend			; array has zero size or all done
	cld
	mov	R8, 001000100010001h	; W * R8 copies word W into all 4 words
PIXEL:	xor	rax,rax
	lodsw
	mul	R8			; RDX is cleared (product fits in 64 bits)
	stosq
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv16_64_MMX endp


;*************************************************************************************


        public  Conv16_4_MMX
; Packs 16-bit pixels into 4-bit pixels by keeping the top nibble of every
; source word. The first pixel of a pair lands in the high nibble of the
; output byte; an odd trailing pixel is stored with a zero low nibble.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, mm0, mm1, mm3, flags; rdi/rsi are saved by USES
Conv16_4_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,8
	jb	PIXEL1			; fewer than 8 pixels: scalar tail only

	mov	eax,00F000F0h
	movd	mm3,eax
	punpckldq mm3,mm3		; mm3 = 00F0h in every word
PIXEL8: movq	mm0,qword ptr [rsi]
	psrlw	mm0,8			; keep the high byte of pixels 0..3
	movq	mm1,qword ptr [rsi+8]
	psrlw	mm1,8			; keep the high byte of pixels 4..7
	packuswb mm0,mm1		; 3h 3l|2h 2l|1h 1l|0h 0l
	movq	mm1,mm0
	add	rsi,16
	pand	mm0,mm3			;  -  -  2h -| -  -  0h -
	psrlw	mm1,12			;  -  -  - 3h| -  -  - 1h
	por	mm0,mm1			;  -  -  2h3h| -  -  0h1h
	packuswb mm0,mm0
	movd	dword ptr [rdi],mm0
	add	rdi,4
	sub	rcx,8
	jae	PIXEL8			; at least 8 more pixels remain
	emms

PIXEL1:	add	rcx,8			; rcx = remaining pixels (0..7)
	jz	toend
	cld
PIXEL:	inc	rsi			; skip the low byte of the word
	lodsb				; load 1st hi byte
	and	al,0F0h

	dec	rcx
	jnz	NIBBLE2
	stosb				; store incomplete nibble
	jmp	toend

NIBBLE2:mov	ah,al
	inc	rsi			; skip the low byte of the word
	lodsb				; load 2nd hi byte
	and	al,0F0h
	ror	al,4			; move the nibble to the low half
	or	al,ah
	stosb				; store both nibbles
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv16_4_MMX endp


;*************************************************************************************


        public  Conv16_8_MMX
; Reduces 16-bit pixels to 8-bit pixels by keeping the high byte of each word.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, mm0, flags
; RDI and RSI are callee-saved in the Microsoft x64 ABI and are overwritten
; here, so they must be listed in USES to be saved and restored.
Conv16_8_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,4
	jb	PIXEL1			; fewer than 4 pixels: scalar tail only

PIXEL4:	movq	mm0,qword ptr[rsi]	; pixels 1,2,3,4
	add	rsi,8
	psrlw	mm0,8			; high byte of every word
	packuswb mm0,mm0		; words are <= 0FFh, so no saturation occurs
	movd	dword ptr [rdi],mm0

	add	rdi,4
	sub	rcx,4
	jae	PIXEL4			; at least 4 more pixels remain
	emms

PIXEL1:	add	rcx,4			; rcx = remaining pixels (0..3)
	jz	toend
	cld
PIXEL:	lodsw
	mov	al,ah			; keep the high byte
	stosb
	dec	rcx
	jnz	PIXEL
toend:
	ret

Conv16_8_MMX endp


;*************************************************************************************

        public  Conv16_32_MMX
; Expands 16-bit pixels to 32-bit pixels by duplicating each word (W -> WWh).
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rax, rcx, rdx, mm0, flags; rdi/rsi are saved by USES
Conv16_32_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,2
	jb	PIXEL1			; fewer than 2 pixels: scalar tail only

PIXEL2: movd	mm0,dword ptr[rsi]	; pixels 1,2
	add	rsi,4
	punpcklwd mm0,mm0		; 2 2 1 1
	movq	qword ptr [rdi],mm0
	add	rdi,8
	sub	rcx,2
	jae	PIXEL2			; at least 2 more pixels remain
	emms

PIXEL1: add	rcx,2			; rcx = remaining pixels (0..1)
	jz	toend
	cld
PIXEL:	lodsw
	mov	dx,ax
	rol	eax,16			; move the word to the upper half
	mov	ax,dx			; eax = WWh
	stosd
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv16_32_MMX endp


;*************************************************************************************


        public  Conv32_16_MMX
; Reduces 32-bit pixels to 16-bit pixels by keeping the high word of each dword.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rcx, mm0, flags; rdi/rsi are saved by USES
Conv32_16_MMX proc \
        uses rdi rsi

	mov	rdi,rcx			; rdi = destination pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	mov	rsi,rdx			; rsi = source pointer

	sub	rcx,2
	jb	PIXEL1			; fewer than 2 pixels: scalar tail only

PIXEL2: movq	mm0,qword ptr[rsi]	; dword pixels 1,2
	add	rsi,8
	psrad	mm0,16			; arithmetic shift: each dword becomes the sign-extended
					; high word, so it always fits in a signed 16-bit value
	packssdw mm0,mm0		; the signed pack is therefore exact; a logical shift
					; would saturate every value >= 8000h to 7FFFh
	movd	dword ptr [rdi],mm0
	add	rdi,4
	sub	rcx,2
	jae	PIXEL2			; at least 2 more pixels remain
	emms

PIXEL1: add	rcx,2			; rcx = remaining pixels (0..1)
	jz	toend
	cld
PIXEL:	add	rsi,2			; skip the low word of the dword
	movsw				; copy the high word
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv32_16_MMX endp


;*************************************************************************************


        public  Conv32_64_MMX
; Expands 32-bit pixels to 64-bit pixels by duplicating each dword.
;
; In (Microsoft x64):	RCX = Dest, RDX = Src, R8D = count (pixels)
; Clobbers:		rcx, rdx, mm0, flags; rsi is saved by USES
Conv32_64_MMX proc \
        uses rsi

	mov	rsi,rdx			; rsi = source pointer
	test	rdx,rdx
	jz	toend			; NULL src pointer
	mov	rdx,rcx			; rdx = destination pointer
	test	rcx,rcx
	jz	toend			; NULL dst pointer
	mov	ecx,r8d			; rcx = pixel count; 32-bit move zero-extends,
					; the upper half of R8 is undefined for a DWORD arg
	test	ecx,ecx
	jz	toend			; array has zero size

PIXEL:	movd	mm0,dword ptr [rsi]
	add	rsi,4
	punpckldq  mm0,mm0		; low dword copied to the high dword
	movq	qword ptr [rdx],mm0
	add	rdx,8
	dec	rcx
	jnz	PIXEL
	emms				; the early exits above never touch MMX state

toend:
	ret

Conv32_64_MMX endp



        end
