diff options
author | B. Watson <urchlay@slackware.uk> | 2025-04-22 03:43:03 -0400 |
---|---|---|
committer | B. Watson <urchlay@slackware.uk> | 2025-04-22 03:43:03 -0400 |
commit | 22e209f12b3427d0a89b3e33f2471b426ec46f3d (patch) | |
tree | e298c101875f36ef27d5370e5e03da820ef2bc69 | |
download | atari8-self-relocator-22e209f12b3427d0a89b3e33f2471b426ec46f3d.tar.gz |
initial commit
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | Makefile | 18 | ||||
-rw-r--r-- | README.txt | 246 | ||||
-rw-r--r-- | autorun.sys | bin | 0 -> 453 bytes | |||
-rw-r--r-- | dos_20s.atr | bin | 0 -> 92176 bytes | |||
-rw-r--r-- | hello.s | 75 | ||||
-rwxr-xr-x | mkrelocxex.pl | 159 | ||||
-rw-r--r-- | reloc.s | 193 |
8 files changed, 694 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dc34f28 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +reloc.atr +*.o +*.xex diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8212e19 --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +reloc.atr: reloc.xex hello40.xex hello41.xex mkrelocxex.pl autorun.sys + cp dos_20s.atr reloc.atr + axe -w autorun.sys reloc.atr + +autorun.sys: reloc.xex hello40.xex hello41.xex + ./mkrelocxex.pl hello40.xex hello41.xex autorun.sys + +reloc.xex: reloc.s + cl65 -t none -o reloc.xex reloc.s + +hello40.xex: hello.s + cl65 -t none -o hello40.xex --asm-define start_addr=0x4000 hello.s + +hello41.xex: hello.s + cl65 -t none -o hello41.xex --asm-define start_addr=0x4102 hello.s + +clean: + rm -f reloc.atr hello40.xex hello41.xex reloc.xex *.o diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..189b886 --- /dev/null +++ b/README.txt @@ -0,0 +1,246 @@ + +How to do a self-relocating Atari 8-bit executable... + +This is a modified form of a technique I saw in Bill Wilkinson's +Insight: Atari column in Compute! magazine (Issue 21, Feb 1982). + +In the original scheme, you'd assemble the code twice, with the origin +(start address) one page apart. Say, assemble at address $4000, then +the 2nd time at $4100. Now, any bytes in the two object files that +differ by 1, are what needs to be changed when relocating. Suppose you +want to relocate to $2000, you just subtract $20 from all the bytes in +the first file that are 1 less than the same byte in the 2nd file. + +This works, and is simple enough. The limitation is, you can only +relocate to an even page boundary. If you want to relocate to the +bottom of memory (pointed to by MEMLO), you probably will waste a few +bytes. In DOS 2.0S, I get $1CFC in MEMLO. Relocating to an even page +boundary means the code goes at $1D00, and the 4 bytes from $1CFC +to $1D00 are wasted. That's not so bad... 
but if I enable another +drive in DOS, that bumps MEMLO up by 128 bytes, to $1D7C. Then my +relocatable code ends up at $1E00, and I waste 132 bytes below that... + +In the modified form presented here, the code is still assembled +twice, but the 2nd pass is ORG'ed 258 ($0102) bytes higher than +the first. Now we have bytes that differ by one (the high bytes of +addresses) and others that differ by two (the low bytes). + +Another, more serious limitation of the code from Insight: Atari is +that it doesn't produce self-relocating executables. What it produces +is BASIC programs that have the relocatable object code as DATA +statements, POKEd into memory when run. The relocator presented here +gets appended to your standard executable and relocates it "on the +fly", then jumps to the start of the relocated code. + +Example: a subroutine call to within our own code: + + JSR print_banner + +This is the first instruction in our program, so it will be found +at $4000 for the first assembly pass, and $4102 for the second. + +Say print_banner ends up at $4123 when we assemble at $4000, and $4225 +when assembling at $4102. Further, we determine MEMLO has $1D80. So, +when we relocate the program, it ends up at $1D80. The target of the +JSR instruction has to be adjusted to match the new location where +print_banner is going to be. + +The code that does the relocation, we'll call the relocator. The term +"relocating loader" is used elsewhere, but it's not accurate here: DOS +is the loader, and we're not replacing it. + +The relocator is a small routine that gets appended to the first +executable (the $4000 one) as a segment, plus two data tables (one +each for low and high bytes), as another 2 segments, plus an INITAD +segment that runs the relocator code. These all have to load at a +fixed address, but once they're finished running, they won't be needed +again. 
+ +The relocator has to know the load address and the length of the main +segment of the program (the part it's going to relocate). What it +does: + +1. Subtract the load address ($4000 in the example) from the contents + of MEMLO. This gives us a negative number (we hope!) that is the + amount each address in the program should have added to it. + +2. Iterate over the two data tables, adding the offset. Each table entry + is the two-byte address of a byte that needs to be changed (an + absolute address that's "baked" into the program). The high and low + bytes of the addresses in the code are handled separately (hence + the two tables). The low byte of the offset is added to the bytes + at the addresses in the low-byte table, and the high byte of the + offset for the high-byte table. + +3. Move the main segment to MEMLO. + +4. Set MEMLO to point to the byte after the end of the program + to protect it from being overwritten by e.g. BASIC or ASM/ED. + +5. Add the offset to the contents of RUNAD, which is the run address + of the program, and then do an RTS to hand control back to DOS. + DOS will run the relocated code by jumping to the altered RUNAD. + +Notes: + +- To keep things simple, the program must consist of a single + segment of code and data, followed by an init address and/or a run + address. + +- If your program is a device driver or a "TSR", you should use an + init address, NOT a run address. This allows users to append your + program to e.g. an RS-232 driver, and maybe a RAMdisk driver too, + etc. Each driver should have an init address, because Atari + executables can have multiple init addresses. + +- If your program is an application, it's usually better to use a run + address. If you use an init address, your program will run, but DOS + will still be "in the middle of" loading the executable, meaning + IOCB #1 will still be open for reading. + +- The program's end address must be below $6C00, since that's where + the relocator and tables load. 
The reason for this restriction + is to allow the relocatable executable to work with a 16K cartridge. + The lowest sane start address for the program is probably $2000, + which allows the program to be 19KB in size... though $3000 is + a lot safer (15KB max). + +- Whatever start address (ORG) you use for the program, it has to + be higher than the current MEMLO when the relocation is done. + That's why I said $3000 is safer than $2000: if someone uses a fancy + DOS and/or has lots of device drivers loaded, MEMLO could exceed + $2000, which would cause your program to crash when loaded. + +- The data tables' combined size must not exceed 4K. Generally the + tables will be the same size, and each entry is 2 bytes, so this + means you can't have more than about 1000 absolute references in + your code. This doesn't count references that point outside your + code, like e.g. JMP CIOV or STA CRSINH; these won't be relocated, + or your program wouldn't work. As a reference, the 8K Atari BASIC + cartridge would require 1522 bytes of data tables, if we were trying + to relocate it. + +- The original Wilkinson scheme was done entirely in Atari BASIC. I + use a C program to create the relocation tables, and the relocator + itself becomes part of the relocatable program, so BASIC is not + required. The C program can be run on either the Atari or on + a modern POSIX system, which is especially useful if you use a + cross-assembler to write and assemble your Atari code. + +- Indirect JMP instructions should always be used with care on the + 6502. The two operand bytes have to be in the same page, due to a + 6502 bug. Most 6502 asm programmers know how to handle this... but + with dynamically relocatable code, there's not really a good way to + do it. Best to avoid indirect JMPs. One simple workaround is to use + self-modifying code: Have an absolute JMP instruction in your code, + and store the indirect jump's destination there. 
Example: + + JMP (VECTOR) + +...becomes: + + LDA VECTOR + STA TRAMPOLINE+1 + LDA VECTOR+1 + STA TRAMPOLINE+2 + JMP TRAMPOLINE + ; somewhere in the code you have this: +TRAMPOLINE JMP $0000 + + Another way to do it would be to use call-by-RTS (push the jump + address minus one on the stack, then execute RTS). + +- If your code has really tight cycle-counted timing loops, the timing + might get thrown off due to relocation causing a branch to cross a + page boundary, when it was originally not supposed to. This kind of + code generally only belongs in games and demos. Relocatable code is + usually used for things like device drivers or programming utilities. + Games "take over" the whole machine and don't have to care about MEMLO + or other software needing free RAM. + +Format of the relocatable executable: + +- Segment with the original code, at the original load address. +- Segment with the relocator code and relocation tables. +- INITAD segment that runs the relocator code. + +Note that the original RUNAD and INITAD segments (if any) don't appear +in the relocatable file as segments. + +Relocation tables start immediately after the last byte of the relocator. + +First 8 bytes are 4 words: +- Original load address +- Original end address +- Original run address (or 0 for none) +- Original init address (or 0 for none) + +The next N bytes are the high-byte relocation table. Each entry +is a word, the address of a byte within the program that has to be +relocated. The table ends with $0000. + +The next N bytes are the low-byte table, same format as the high-byte +table including the $0000 at the end. The high and low byte tables +will generally be the same size, but this is not a requirement. + +For the init address, if it's not zero, the relocator JSR's to it (at its +new location). + +For the run address, if it's not zero, the relocator adjusts RUNAD, +and DOS uses RUNAD as usual when the program's done loading. 
+ +Example: + + *=$4000 +start: + jsr set_color ; $4000 JSR $4007 + jsr set_cursor ; $4003 JSR $400E + rts ; $4006 +set_color: + lda bgcolor ; $4007 LDA $4015 + sta COLOR2 ; $400A + rts ; $400D +set_cursor: + lda cursor ; $400E LDA $4016 + sta CRSINH ; $4011 + rts ; $4014 +bgcolor: .byte $00 ; $4015 +cursor: .byte $01 ; $4016 + *=INITAD + .word start + +The address table for the above program: + +$00 40 - code_start +$16 40 - code_end +$00 00 - code_run (no run address) +$00 40 - code_init + +High byte relocation table: + +$02 $40 ; hi byte of JSR $4007 operand +$05 $40 ; hi byte of JSR $400E operand +$09 $40 ; hi byte of LDA $4015 operand +$10 $40 ; hi byte of LDA $4016 +$00 $00 ; terminator + +Low byte relocation table: + +$01 $40 ; lo byte of JSR $4007 operand +$04 $40 ; lo byte of JSR $400E operand +$08 $40 ; lo byte of LDA $4015 operand +$0F $40 ; lo byte of LDA $4016 +$00 $00 ; terminator + +Program loads from $4000 to $4016. If MEMLO was $1CFC, the relocator +will move the program to $1CFC - $1D12 and set MEMLO to $1D13. The +operand of the first instruction (was JSR $4007) will be altered +to $1D03 (aka $4007 - $4000 + $1CFC), which is the address that the +subroutine got relocated to. + +The original program assembled to a 32-byte file. The relocatable +version will be around 200 bytes: 28 bytes for the original file +(minus its INITAD segment), ~128 bytes for the relocator code, 8 bytes +for the address table, and 20 bytes for the two relocation tables. +However, the relocator and tables are only used once, and can be +overwritten afterwards (so they count as free memory). 
diff --git a/autorun.sys b/autorun.sys Binary files differnew file mode 100644 index 0000000..31e2914 --- /dev/null +++ b/autorun.sys diff --git a/dos_20s.atr b/dos_20s.atr Binary files differnew file mode 100644 index 0000000..8016b73 --- /dev/null +++ b/dos_20s.atr @@ -0,0 +1,75 @@

; hello.s - small demo payload for the relocator.
;
; Prints "Hello World." and then keeps recolouring the background forever.
; The Makefile assembles this file twice (start_addr=$4000 and $4102) so
; that mkrelocxex.pl can diff the two images.
; Define RAW to leave out the binary-load header and the INITAD trailer.

        .export _main
        .include "atari.inc"

        ; zero-page scratch, placed at FR0
        sptr   = FR0                    ; 2 bytes: string pointer
        scount = FR0+2                  ; 1 byte:  index of next character

        .ifndef start_addr
        start_addr = $4000
        .endif

        .ifndef RAW
        ; segment header: $FFFF marker, first address, last address
        .org start_addr - 6
        .word $ffff, start_addr, end_addr - 1
        .endif

        .org start_addr

; ---------------------------------------------------------------------
; _main: entry point (also the INITAD target unless RAW is defined).
; Never returns: ends in an endless colour-cycling loop.
; ---------------------------------------------------------------------
_main:
        ldx #0
        stx COLCRS                      ; COLCRS (low byte) = 0
        inx
        stx CRSINH                      ; CRSINH = 1

        lda #<str1                      ; A/X = address of "Hello"
        ldx #>str1
        jsr printstr
        lda #' '
        jsr printa

        lda #<str2                      ; A/X = address of "World"
        ldx #>str2
        jsr printstr
        lda #'.'
        jsr printa

@spin:
        lda RTCLOK+2                    ; derive a colour from RTCLOK+2
        and #$f0                        ; keep its upper nibble ...
        ora #$06                        ; ... and force the lower nibble to 6
        sta COLOR2
        jmp @spin

; ---------------------------------------------------------------------
; printstr: print a zero-terminated string via printa.
; In:    A = low byte, X = high byte of the string address
; Uses:  sptr, scount
; Stops at the first zero byte, or once scount wraps after 256 characters.
; ---------------------------------------------------------------------
printstr:
        sta sptr
        stx sptr+1
        ldy #0
        sty scount
@next:
        ldy scount                      ; reload the index on every pass
        lda (sptr),y
        beq @done                       ; terminator reached
        jsr printa
        inc scount
        bne @next                       ; falls through when scount wraps to 0
@done:
        rts

; ---------------------------------------------------------------------
; printa: print the character in A.
; Uses call-by-RTS: the two vector bytes are pushed (high byte first) and
; RTS then jumps to the pushed address plus one, so this relies on
; ICPTL/ICPTH holding the handler address minus one.
; Clobbers X (used to park the character).
; ---------------------------------------------------------------------
printa:
        tax                             ; park the character
        lda ICPTH                       ; the print-one-byte vector for IOCB 0.
        pha
        lda ICPTL                       ; low byte of vector
        pha
        txa                             ; character back in A for the handler
        rts                             ; "return" into the handler

str1:   .asciiz "Hello"
str2:   .asciiz "World"

        .ifndef RAW
end_addr:
        ; INITAD segment: the loader calls _main once this has been read
        .word INITAD, INITAD+1, _main
        .endif
diff --git a/mkrelocxex.pl b/mkrelocxex.pl new file mode 100755 index 0000000..e4b8a10 --- /dev/null +++ b/mkrelocxex.pl @@ -0,0 +1,159 @@
#!/usr/bin/perl -w

# mkrelocxex prototype in perl (will rewrite in C).
# this version only supports init addresses, not run addresses.

use strict;
use warnings;
use bytes;

# Usage: mkrelocxex.pl <lo> <hi> <out>
#
# <lo> and <hi> are the same program assembled $0102 bytes apart.  Bytes
# that differ by 1 are high bytes of absolute addresses, bytes that differ
# by 2 are low bytes.  The output file holds the <lo> program segment, then
# the relocator (read from reloc.xex) with the address table and both
# relocation tables appended, then an INITAD segment that runs the relocator.

die "usage: $0 <lo> <hi> <out>\n" unless @ARGV == 3;
my ($lo_name, $hi_name, $out_name) = @ARGV;

my $INITAD    = 0x02e2;      # init-address vector used in the binary-load format
my $ORG_DELTA = 0x0102;      # distance between the two assemblies

# Read one little-endian 16-bit word; returns undef at EOF / short read.
sub read_word {
    my ($fh) = @_;
    my $n = read($fh, my $buf, 2);
    return undef unless defined $n && $n == 2;
    return unpack('v', $buf);
}

# Read a segment header (optional $FFFF marker, start address, end address).
# Returns (start, end), or an empty list at EOF.  A word value of 0 is a
# valid result and is not mistaken for EOF.
sub read_header {
    my ($fh) = @_;
    my $start = read_word($fh);
    return unless defined $start;
    if ($start == 0xffff) {
        $start = read_word($fh);
        return unless defined $start;
    }
    my $end = read_word($fh);
    return unless defined $end;
    return ($start, $end);
}

# Read the bytes of a segment covering $start..$end (inclusive) as a list
# of integers.
sub read_seg {
    my ($fh, $start, $end) = @_;
    my @bytes;
    for ($start .. $end) {
        my $b;
        read($fh, $b, 1) || die "early EOF, WTF?\n";
        push @bytes, ord($b);
    }
    return @bytes;
}

# Print a relocation table to stdout, ten addresses per line.
sub print_table {
    my ($name, $t) = @_;
    print "\n$name byte table:\n";
    my $i = 0;
    for my $addr (@$t) {
        printf "\$%04x ", $addr;
        print "\n" if ++$i % 10 == 0;
    }
    print "\n";
}

open my $lo, "<", $lo_name or die "$lo_name: $!\n";
open my $hi, "<", $hi_name or die "$hi_name: $!\n";
binmode $_ for $lo, $hi;

my ($start, $end) = read_header($lo)
    or die "$lo_name: can't read segment header\n";
my ($hi_start, $hi_end) = read_header($hi)
    or die "$hi_name: can't read segment header\n";

printf("lo start/end: \$%04x/\$%04x\n", $start, $end);
printf("hi start/end: \$%04x/\$%04x\n", $hi_start, $hi_end);

if (($hi_start != ($start + $ORG_DELTA)) || ($hi_end != ($end + $ORG_DELTA))) {
    die "mismatched segment lengths\n";
}

my @bytes    = read_seg($lo, $start, $end);
my @hi_bytes = read_seg($hi, $hi_start, $hi_end);

# Classify every differing byte.  Anything other than +1 / +2 (including a
# byte that wrapped past $FF) cannot be handled by this scheme.
my (@hi_table, @lo_table);
for my $i (0 .. $#bytes) {
    my ($l, $h) = ($bytes[$i], $hi_bytes[$i]);
    next if $l == $h;
    if ($h == $l + 1) {
        push @hi_table, $i + $start;
    } elsif ($h == $l + 2) {
        push @lo_table, $i + $start;
    } else {
        die sprintf("invalid difference (not 1 or 2) at \$%04x\n", $i + $start);
    }
}

# each table is terminated by a $0000 entry
push @hi_table, 0;
push @lo_table, 0;

print_table("hi", \@hi_table);
print_table("lo", \@lo_table);

# Optional INITAD segment after the main segment; 0 means "no init address".
my $init = 0;
my ($istart, $iend) = read_header($lo);
if (defined $istart && $istart == $INITAD && $iend == $INITAD + 1) {
    $init = read_word($lo);
    die "$lo_name: truncated init segment\n" unless defined $init;
}
close $lo;
close $hi;

# Load the relocator image.
open my $r, "<", "reloc.xex" or die "reloc.xex: $!\n";
binmode $r;
my ($rstart, $rend) = read_header($r)
    or die "reloc.xex: can't read segment header\n";
my $rlen = $rend - $rstart + 1;
my $got = read($r, my $rcode, $rlen);
die "reloc.xex: short read\n" unless defined $got && $got == $rlen;
close $r;

# The program must end below the relocator's load address, otherwise the
# relocator segment would overwrite part of it while loading.
die sprintf("program end \$%04x is not below the relocator at \$%04x\n",
            $end, $rstart) if $end >= $rstart;

# 8-byte address table: start, end, run (always 0 here), init
$rcode .= pack('v4', $start, $end, 0, $init);
$rcode .= pack('v*', @hi_table);
$rcode .= pack('v*', @lo_table);

$rend = $rstart + length($rcode) - 1;
printf STDERR "relocator segment: \$%04x-\$%04x (%d bytes)\n",
    $rstart, $rend, length($rcode);

# OK, make the output file now...  (only created once all input checks passed)
open my $out, ">", $out_name or die "$out_name: $!\n";
binmode $out;

# program segment
print {$out} pack('v3', 0xffff, $start, $end), pack('C*', @bytes);

# relocator segment (code + tables).
# don't really need a ffff header, makes it easier to read hexdumps.
print {$out} pack('v3', 0xffff, $rstart, $rend), $rcode;

# init address
print {$out} pack('v3', $INITAD, $INITAD + 1, $rstart);

close $out or die "$out_name: $!\n";
@@ -0,0 +1,193 @@

        .export _main
        .include "atari.inc"

        start_addr = $6c00

        ; mkrelocxex.c appends this stuff.
+ code_start = end_addr + code_end = end_addr+2 + code_run = end_addr+4 + code_init = end_addr+6 + table = end_addr+8 + + zp_addr = FR0 + offset_lo = zp_addr + offset_hi = zp_addr+1 + table_ptr = zp_addr+2 ; 2 bytes + dest_ptr = table_ptr + code_ptr = zp_addr+4 ; 2 bytes + fixup = zp_addr+6 + + .org start_addr - 6 + .word $ffff + .word start_addr + .word end_addr - 1 + +_main: + lda code_start + sec + sbc MEMLO + + sta offset_lo + lda code_start+1 + sbc MEMLO+1 + sta offset_hi + + bcs memlo_ok + + ; whoops, MEMLO is too high +whoops: + ldx #0 + lda #<whoops_msg + sta ICBAL + lda #>whoops_msg + sta ICBAL+1 + lda #whoops_len + sta ICBLL + stx ICBLH + lda #PUTCHR + sta ICCOM + jsr CIOV +exitwait: + lda CH + cmp #$ff + beq exitwait + lda #$ff + sta CH + lda #0 + sta COLOR2 + rts + +memlo_ok: + ; 1st fixup pass, hi bytes: table comes right after our code + sta fixup + lda #<table + sta table_ptr + lda #>table + sta table_ptr+1 + jsr fixup_addrs + + ; 2nd fixup pass, lo bytes: table_ptr already points to table + lda offset_lo + sta fixup + jsr fixup_addrs + + ; absolute addresses are fixed up, now move the code. + lda code_start + sta code_ptr + lda code_start+1 + sta code_ptr+1 + lda MEMLO + sta dest_ptr + lda MEMLO+1 + sta dest_ptr+1 + + ; x = (code_end >> 8) - (code_start >> 8) + 2 + lda code_end+1 + sec + sbc code_start+1 + tax + inx + inx + ldy #0 + + ; this moves a page at a time, meaning if code_end isn't + ; on an even page boundary, we move a little more than + ; needed. it won't hurt anything, and it follows the + ; KISS principle. +move_loop: + lda (code_ptr),y + sta (dest_ptr),y + iny + bne move_loop + inc code_ptr+1 + inc dest_ptr+1 + dex + bne move_loop + + + ; bump MEMLO to point one byte past the end of the moved code. 
+ lda code_end + sec + sbc code_start + sta code_end + lda code_end+1 + sbc code_start+1 + sta code_end+1 + inc code_end + bne ceok + inc code_end+1 ; code_end is now the code length + 1 byte +ceok: + lda code_end + clc + adc MEMLO + sta MEMLO + lda code_end+1 + adc MEMLO+1 + sta MEMLO+1 ; MEMLO now MEMLO + code length + 1 byte + + ; is RUNAD in our code space? If not, it points somewhere + ; within DOS, and shouldn't be altered. + lda RUNAD+1 + cmp code_start+1 + bcc do_init + + ; fix up RUNAD + lda RUNAD + sec + sbc offset_lo + lda RUNAD+1 + sbc offset_hi + +do_init: + ; if there's an init address, call it (just like DOS would). + lda code_init+1 + beq done ; if hi byte is 0, assume lo byte is also 0. + + lda code_init ; subtract offset + sec + sbc offset_lo + sta code_init + lda code_init+1 + sbc offset_lo + sta code_init+1 + + jmp (code_init) + + ; done +done: + rts + +fixup_addrs: + ldy #1 + lda (table_ptr),y + sta code_ptr+1 + dey + lda (table_ptr),y + sta code_ptr + inc table_ptr ; point to next entry + bne tp1ok + inc table_ptr+1 +tp1ok: + inc table_ptr + bne tp2ok + inc table_ptr+1 +tp2ok: + ora code_ptr+1 ; quit if we hit $0000 in the table + beq done + lda (code_ptr),y ; Y still 0 + sec + sbc fixup + sta (code_ptr),y + jmp fixup_addrs + +whoops_msg: .byte "MEMLO is too high! Press any key to exit.", EOL + whoops_len = (*-whoops_msg) + +end_addr: + +; this was for testing only. mkrelocxex.c adds the init address. +; .word INITAD +; .word INITAD+1 +; .word _main |