I understood the problem. The right solution is here. "Right" doesn't necessarily mean "optimal", but it works nicely anyway, and the task isn't demanding enough to justify optimizing this code now.
.model tiny
.code
org 100h
TSR_PARAGRAPHS  equ 00ffh       ; resident size handed to DOS, in 16-byte paragraphs
                                ; counted from the PSP (0FFh * 16 = 4080 bytes).
                                ; Fixed over-estimate: must stay >= the offset of the
                                ; end of the handler, rounded up to a paragraph.
PSP_ENV_SEG     equ 2Ch         ; PSP offset of the word holding the environment segment

;-----------------------------------------------------------------------
; start - .COM entry point: hook INT 21h and stay resident.
; Assumes the .COM start-up state CS = DS = PSP segment (tiny model), so
; the plain [old_int21h] stores below land in the same segment the
; handler later reads through CS.
; Does not return; ends with INT 21h AH=31h.
;-----------------------------------------------------------------------
start:
        ; Remember the current INT 21h handler so the new one can chain to it.
        mov     ax, 3521h               ; AH=35h get vector, AL=21h
        int     21h                     ; -> ES:BX = current handler
        mov     [old_int21h], bx        ; offset word
        mov     [old_int21h + 2], es    ; segment word

        ; Install the new handler. No cli/sti here: INT clears IF on entry and
        ; DOS re-enables interrupts itself, so a surrounding cli protects
        ; nothing; function 25h updates the vector atomically. DS already
        ; equals CS, so no segment reload is needed for DS:DX.
        mov     dx, offset myint21h     ; DS:DX -> new handler
        mov     ax, 2521h               ; AH=25h set vector, AL=21h
        int     21h

        ; Release the environment block: the resident handler never reads it,
        ; and otherwise it stays allocated for as long as the TSR lives.
        ; Best effort - a failure (CF set) is ignored on purpose.
        mov     es, word ptr ds:[PSP_ENV_SEG]
        mov     ah, 49h                 ; free memory block at ES:0000
        int     21h

        ; Terminate and stay resident.
        mov     dx, TSR_PARAGRAPHS      ; paragraphs to keep, starting at the PSP
        mov     ax, 3100h               ; AH=31h TSR, AL=00h exit code
        int     21h
; Resident part: saved vector and the new handler.

; Far pointer to the previous INT 21h handler, filled in by the installer:
; word 0 = offset, word 1 = segment.
old_int21h      dw 2 dup (?)

;-----------------------------------------------------------------------
; myint21h - replacement INT 21h handler.
; In:    whatever the INT 21h caller passed; nothing is modified here.
; Out:   does not return to this code - control continues in the
;        previous handler, whose IRET returns to the original caller.
; Clobb: none (registers, flags and SP reach the old handler unchanged).
;-----------------------------------------------------------------------
myint21h proc
        ; some stuff

        ; Chain to the previous handler. DS is the caller's here, so the
        ; saved pointer is read through CS.
        jmp     dword ptr [cs:old_int21h]
myint21h endp
end start
The answer below was almost right :)