Source file: /~heha/enas/Convac-Ätzer/uba.zip/firmware-231018/ubaboot.S

/* Copyright 2017 by Robert Evans (rrevans@gmail.com)
*1903xx	Make this source file more readable,
	USB VID+PID bound to Atmels genuine boot loader "dfu" (Atmel Flip)
+1904xx	Added "ubaboot" string descriptor, added signature, fuse and lock data
+1904xx	Created Windows programmer as replacement for the python script,
	it reads ELF files
+190514	Added short-circuit for EEPROM write: Write on change only (save time)
-190515	Failed to program first 128 bytes
+210407	Added app_spm subroutine at end to allow application software
	to self-modify flash out of RWW section with own idle procedure
	(needed for Leica/Wild GRM card)
	Support for AT90USB(8|16)2, ATmega(8|16|32)U2, ATmega(16|32)U4
	Binary compatible to 190515 code
-231017	Non-working AT90USB code (by writing USBCON twice, damn!)
+231018	Optional free !HWB pin, configurable by editing Makefile
*231018	fuses out of ELF file, avrdude flashes ELF file
*231018	No more linker script, no more 0xFF filled application space
-231018	Misleading USB PID for each chip except ATmega32U4 (cosmetic)
*231018	Binary compatibility dropped: New, more useful entry at end-18

Features/commands:
  Write and read flash memory
  Write and read eeprom memory
  Read signature bytes
  Read lock/fuse bits
  Reboot (jump) into user program

These are implemented as a vendor-defined protocol. See README for details.
The sample "pyusb" driver can upload and verify programs.
The (my = Henrik Haftmann) Windows program doesn't need python.

Booting disables the watchdog and clears MCUSR which is preserved in R2.
Jumps to user program at 0x0000 for any reset reason except from !RESET pin.
Note that application software cannot rely on MCUSR content,
the reset reason must be detected by inspecting R2.
The CPU clock is not divided by 8 at startup.

Best way to enter the bootloader by user code:
  - save a sentinel (= magic word) in a .noinit RAM location
  - generate a Watchdog reset
  - early in initialization code, check sentinel and jump to bootloader

Implementation notes:
  - heavily optimized for size not speed
  - USB registers accessed via indirect addressing: LDD Y+d, STD Y+d
  - many branches fall-through instead of jumping
  - no interrupts (a vector table takes too much space)
  - zero register is moved to YH vs. gcc's usual r1
*/
#define __SFR_OFFSET 0
#include <avr/io.h>

// Every USB module register is reached through Y-relative LDD/STD instead of
// absolute addressing; this saves program words. Y is loaded once with YBASE
// and always points to the base of the USB module address space.

#define YBASE 0xD7

// sty REG,Rr: store CPU register Rr into USB register REG (addressed as Y+d)
.macro sty reg,val
	std	y+\reg-YBASE,\val
.endm
// ldy Rd,REG: load CPU register Rd from USB register REG (addressed as Y+d)
.macro ldy dst,reg
	ldd	\dst,y+\reg-YBASE
.endm

// avr-libc 2.0.0 is missing USBRF definition for atmega32u4
#ifndef USBRF
# define USBRF 5
#endif

// USB commands
//
// The SETUP handling code exploits the fact that for all requests the upper
// four bits of bRequest are zero and the lower four bits of bmRequestType are
// also zero. So the two values can be bit-wise OR'ed and compared at the same
// time against a single command value. This saves space vs. separate compares.
//
// Note that this requires all requests to be DEVICE requests because the lower
// four bits of bmRequestType are used to identify the interface or endpoint for
// other transaction types.
// Each command value is (bmRequestType | bRequest):
//   0xC0 | n = read request  (bit 7 set: data stage goes device-to-host)
//   0x40 | n = write request (bit 7 clear: data stage goes host-to-device)
// Bit 7 of the command later selects the RD_DATA or WR_DATA state.
#define CMD_GET_SIGRD   (0xC0 | 1)
#define CMD_GET_PMEM    (0xC0 | 2)
#define CMD_SET_PMEM    (0x40 | 3)
#define CMD_REBOOT      (0x40 | 4)
#define CMD_GET_EEPROM  (0xC0 | 5)
#define CMD_SET_EEPROM  (0x40 | 6)
#define CMD_GET_LOCK    (0xC0 | 7)


// USB control endpoint state machine values
//
// Each state reacts to a subset of UEINTX bits.
//
//   State      description and active UEINTX bits
//   -----      ---------------------------
//   SETUP      waiting for SETUP token
//   WR_DATA    waiting for OUT tokens for host-to-device DATA stage
//                 RXOUTI -> handle data from host
//                 NAKINI -> start STATUS stage (new state = WR_STATUS)
//   WR_STATUS  waiting for IN token for host-to-device STATUS data
//                 TXINI  -> transaction done (new state = SETUP)
//   RD_DATA    waiting for IN/OUT token
//                 TXINI  -> buffer DATA stage to send to host
//                           sets state = SETUP when all bytes sent
//
// RXSTPI (not shown above) resets the state machine in all states.
// Other UEINTX bits not listed for a given state are ignored in that state.
//
// The main loop processes UEINTX bits with this equivalent C code:
//
//   uint8_t intx = UEINTX;
//   intx &= state;              // clear all inactive UEINTX bits
//   intx = intx & -intx;        // find lowest asserted UEINTX bit
//   intx = ~intx;               // complement the result for reset
//   uint8_t ms = state & intx;  // selector value (see below)
//   switch (ms) {
//     ...
//     // maybe update intx
//     ...
//   }
//   UEINTX = intx;              // reset handled UEINTX bit(s)
//
// When some active UEINTX bit is asserted, the selector value (ms) equals the
// state value with the lowest asserted UEINTX bit **CLEARED**.
//
// For example, if ms == WR_DATA & ~_BV(RXOUTI) then RXOUTI has been triggered
// in state WR_DATA.
//
// This uniquely identifies both state and each possible asserted UEINTX bit if
// every state value is at least Hamming distance 2 from all other state values.
//
// To meet this criterion some extra bits must be added; any unused bits suffice.
// Below the state values use STALLEDI and FIFOCON for this purpose.
//
// These extra bit values have no effect because the corresponding selector
// values are never tested in the main loop.
//
// RXSTPI is handled separately and does not appear in any of the state values.
//
// The SETUP pseudo-state is zero since no UEINTX bits are active in that state.
//
// Summary:
//   WR_DATA   = 01000100b
//   WR_STATUS = 00000011b
//   RD_DATA   = 10000001b
//
// See also ch. 22 of the datasheet.

// State values are masks of the UEINTX bits each state reacts to:
//   WR_DATA   reacts to RXOUTI and NAKINI
//   WR_STATUS reacts to TXINI; STALLEDI is only a filler bit
//   RD_DATA   reacts to TXINI; FIFOCON is only a filler bit
// The filler bits keep the state values far enough apart for the selector
// trick used in the main loop.
#define WR_DATA    (1<<RXOUTI|1<<NAKINI)
#define WR_STATUS  (1<<TXINI|1<<STALLEDI)
#define RD_DATA    (1<<TXINI|1<<FIFOCON)

// USB product ID reported in the device descriptor, chosen per target chip
// from the compiler's device macro. Building for a chip not listed here
// stops with an error instead of silently using a wrong ID.
#if defined(__AVR_AT90USB82__)
# define idProduct 0x2FF7
#elif defined(__AVR_AT90USB162__)
# define idProduct 0x2FFA
#elif defined(__AVR_ATmega8U2__)
# define idProduct 0x2FEE
#elif defined(__AVR_ATmega16U2__)
# define idProduct 0x2FEF
#elif defined(__AVR_ATmega32U2__)
# define idProduct 0x2FF0
#elif defined(__AVR_ATmega16U4__)
# define idProduct 0x2FF3
#elif defined(__AVR_ATmega32U4__)
# define idProduct 0x2FF4
#else
# error "Not-yet-inserted product ID for boot loader"
#endif

// Program entry
//
// Decides between starting the user program and staying in the boot loader,
// based on the reset cause found in MCUSR.

// Save and clear MCUSR
// WDRF must be cleared to disable the watchdog timer.
bootspace:
	// Original MCUSR is preserved in r2 for user program
	in	r2,MCUSR
	clr	YH		// YH doubles as the zero register from here on
	out	MCUSR,YH
	clr	r18		// r18 = 0: new WDTCSR value, and start value of the delay loop
	rcall	set_wdt	// Disable watchdog

// Busy loop pause for USB detach during reset.
// This ensures that the host detects detach before restart. Typically the
// oscillator/PLL startup delays will exceed the specified USB max detach
// detection timing (2.5 us), but this is here anyway for robustness.

1:	dec	r18	// 0 on entry
	brne	1b	// loops 256 times * 3 cycles = 768 cycles

// Jump to user code if reset was:
// - brown-out
// - watchdog (except if external reset also set)
// - power-on
// - USB reset
//
// Watchdog + external reset triggers the bootloader in case WDTON is set
// since the watchdog may fire while the reset button is being held down.
//
// User code can enter the bootloader by triggering any other reset,
// or by following the instructions at the top of this file.
	sbrc	r2,EXTRF	// Reset pin?
	 rjmp	1f		// Stay in boot loader
	tst	r2		// No reset source, so direct jump?
	breq	1f		// Stay in boot loader
	rjmp	wrap_appspace	// jump to user program

// Enable watchdog for bootloader, 16 ms timeout
// (the main loop and the SPM/EEPROM wait loops execute WDR to keep it quiet)
1:	ldi	r18,1<<WDE
	rcall	set_wdt

// Hardware initialization
// Runs once, straight-line, after the decision to stay in the boot loader.

// PLL initialization
// PDIV3:0 = 0100 (equals reset value)
// For F_CPU other than 8 MHz the PLL input prescaler bit is set as well
// (named PINDIV on some chips, PLLP0 on others).
#if (F_CPU == 8000000)
	ldi	r16,1<<PLLE
#else
# ifndef PINDIV			// AT90USB..., ...U2
#  define PINDIV PLLP0
# endif
	ldi	r16,1<<PINDIV|1<<PLLE
#endif
	out	PLLCSR,r16
1:	in	r0,PLLCSR	// loop_until_bit_is_set(PLLCSR, PLOCK)
	sbrs	r0,PLOCK
	 rjmp	1b
// Setup Y register for indirect addressing, YH is already zero
	ldi	YL,YBASE
// USB initialization
#ifdef UHWCON			// ...U4
	ldi	r16,1<<UVREGE
	sty	UHWCON,r16	// set UVREGE
	// The first store does not set OTGPADE because clock is not enabled.
	// Using the same value for both stores saves program space.
	ldi	r16,1<<USBE|1<<OTGPADE
#else				// AT90USB..., ...U2
	ldi	r16,1<<USBE
#endif
	sty	USBCON,r16	// set USBE / must be written twice on AT90USB...
	sty	USBCON,r16	// set OTGPADE
	sty	UDCON,YH	// set DETACH=0

// Main loop
// Exits only by watchdog reset triggered by REBOOT command.

	// Register assignments in all states:
	//   r2        cmd      see above
	//   r3        state    see above
	//   X   len      length of current transaction
	//   Y   YBASE    Y-register always equals YBASE
	//   Z   ptr      memory pointer (varies by command type)
	// Note: until the first SETUP packet arrives, r2 still holds the saved
	// MCUSR value; the SETUP copy (r2..r9) overwrites it.
	// Loop entry and initialization
	// state = SETUP
	clr	r3
	// Main loop body
loop:
	wdr	// Clear watchdog
// Check for USB reset
	// if (UDINT & _BV(EORSTI)) {
	ldy	r0,UDINT
	sbrs	r0,EORSTI
	 rjmp	intx	// no end-of-reset pending: go poll the endpoint

	// reset USB module and setup endpoint
	ldi	r24,1<<EPEN
	sty	UECONX,r24	// UECONX = EPEN;

	ldi	r24,1<<EPSIZE1|1<<EPSIZE0|1<<ALLOC
	sty	UECFG1X,r24	// setup EP0 for 64 byte FIFO size, one bank
	sty	UDINT,YH	// clear interrupts
	sty	UEINTX,YH
	// } falls through into intx

	// Endpoint handling
	// Reads UEINTX once per loop iteration. A pending SETUP token is parsed
	// here; everything else is passed to handle_state.
	// Register assignment:
	//   r16 = intx (clobbered by SETUP handling)
intx:
	// Check for USB endpoint events
	ldy	r16,UEINTX
	// Check for SETUP token
	sbrs	r16, RXSTPI
	 rjmp	handle_state	// if not got SETUP token
	// Handle SETUP token
	// The 8 byte SETUP token is copied into r2 through r9.
	// (This overwrites the previous cmd/state pair in r2:r3.)
	//    r2 = bmRequestType
	//    r3 = bRequest
	//    r4 = wValueL
	//    r5 = wValueH
	//    r6 = wIndexL
	//    r7 = wIndexH
	//    r8 = wLengthL
	//    r9 = wLengthH
	// r24 = command
	// r25 = state
	// These are copied to r2:r3 upon setup completion
	ldi	ZL,2		// copy 8 bytes from UEDATX to r2:r9
	ldi	ZH,0		// ptr = 0x0002 (r2 in data space)
1:	ldy	r0,UEDATX
	st	Z+,r0
	cpi	ZL,10		// 8 bytes
	brne	1b
	sty	UEINTX,YH	// clear interrupts
	// Parse setup packet
	// STALL if bmRequestType has any bit 0-5 set
	//       or bRequest      has any bit 4-7 set
	// The command value is the bit-wise OR of these two values where
	//   bits 0-3 = bits 0-3 of bRequest
	//   bits 4-5 = 0
	//   bits 6-7 = bits 6-7 of bmRequestType
	movw	r24, r2		// r24 = bmRequestType, r25 = bRequest (scratch copies)
	andi	r24, 0x3f	// if (bmRequestType & 0x3f) goto stall
	brne	stall
	andi	r25, 0xf0	// if (bRequest & 0xf0) goto stall
	brne	stall
	movw	ZL,r4	// Most commands want Z = wValue, so setup memory pointer here
	// See USB commands above
	or	r2,r3		// cmd = bmRequestType | bRequest
	mov	r24,r2		// r24 = cmd, compared against each known command below
	// Descriptor handling (standard GET_DESCRIPTOR: cmd = 0x80 | 6)
	// Selects the descriptor by wValueH (1 = device, 2 = configuration,
	// 3 = string) and leaves
	//   Z = address of the descriptor in flash
	//   X = descriptor length, or wLength if the host asked for less
	// Any other descriptor type is answered with STALL.
	cpi	r24,0x86
	brne	setup_set_addr
	ldi	ZL,lo8(dev_desc)
	ldi	ZH,hi8(dev_desc)	// all descriptors share the same high address
	dec	r5	// wValueH = descriptor type
	breq	2f	// device descriptor (wValueH==1)
	ldi	ZL,lo8(conf_desc)
	ldi	XL,9+9	// sizeof(conf_desc), differs from its bLength byte
	dec	r5
	breq	3f	// configuration descriptor (wValueH==2)
	dec	r5
	brne	stall	// anything but a string descriptor (wValueH==3): STALL
	ldi	ZL,lo8(string_desc0)	// string ID 0: language list
	cpse	r4,YH	// wValueL = string ID; skip next if zero (YH == 0)
	 ldi	ZL,lo8(string_desc1)	// otherwise, deliver "ubaboot" string
	// Descriptor reads can be short because host may read a prefix of either
	// descriptor during enumeration (e.g. for bMaxPacketSize0)
2:	lpm	XL,Z	// get length from descriptor start
3:	clr	XH	// len is 16 bits wide: do not keep a stale high byte
			// from an earlier transaction when wLength >= len
	cp	r8,XL	// if (wLength < len) {
	cpc	r9,YH
	brcc	2f	// wLength >= len: keep len = descriptor length

	// Set cmd = 0x86 for program memory reads
	// which are implemented exactly the same as descriptor reads
	// (a descriptor read with wLength < descriptor length also falls in here)
setup_get_pmem_done:
	ldi	r24,0x86
	// Common SETUP token finalization
	// For non-descriptor control reads the host must always request
	// exact correct length or buffer overrun error occurs.
setup_done:
	//   len = wLength;
	movw	XL,r8
2:	// entry for descriptor reads that keep len = descriptor length
	// state = cmd & 0x80 ? RD_DATA : WR_DATA
	// (r2 still holds bmRequestType | bRequest, so bit 7 is the direction)
	ldi	r25,WR_DATA
	sbrc	r2,7
	 ldi	r25,RD_DATA
	// cmd = r24
	// state = r25
	movw	r2, r24
	rjmp	loop

stall:	// Bad request: STALL endpoint
	// Requests a STALL handshake on EP0 and returns to the SETUP state.
	ldi	r24,1<<STALLRQ|1<<EPEN
	sty	UECONX,r24
	clr	r3		// state = SETUP
	rjmp	loop

	// The following commands are no-ops during setup
	// See state machine handling below for specific behavior
	// Their data/status stages are handled later by the state machine.

setup_set_addr:
	cpi	r24,0x05	// standard request, bRequest 5 (set address)
	breq	setup_done
	cpi	r24,0x09	// standard request, bRequest 9 (set configuration)
	breq	setup_done
	cpi	r24,CMD_REBOOT
	breq	setup_done
	cpi	r24,CMD_GET_EEPROM
	breq	setup_done
	cpi	r24,CMD_SET_EEPROM
	breq	setup_done
	// Signature read
	// This reads the bytes directly into UEDATX during setup
	// The state machine read loop is not used
	cpi	r24,CMD_GET_SIGRD
	brne	setup_get_lock
	// read signature row via SIGRD bit
	//   0000 = signature[0]
	//   0002 = signature[1]
	//   0004 = signature[2]
	ldi	r16,1<<SIGRD|1<<SPMEN
	ldi	r17,2		// step 2: addresses 0, 2, 4
	ldi	r18,6		// stop at 6, i.e. 3 bytes
	// fall-through to setup_rd_spm

	// read special bytes through SPMCSR/LPM
	// Shared by the signature and the lock/fuse read.
	// r16 = SPMCSR value
	// r17 = lo(Z) step
	// r18 = lo(Z) limit
	// Clobbers r0 and Z; pushes (r18 / r17) bytes into the EP0 FIFO.
setup_rd_spm:
	ldi	ZL,0
	ldi	ZH,0
1:	//   UEDATX = load byte from special SPM row
	out	SPMCSR,r16	// must be re-armed for every byte;
	lpm	r0,Z		// LPM directly follows the SPMCSR write
	sty	UEDATX,r0
	add	ZL,r17
	cp	ZL,r18
	brne	1b
	rjmp	setup_done

	// Lock/fuse read
	// This also reads directly into UEDATX during setup
	// The state machine read loop is not used

setup_get_lock:
	cpi	r24, CMD_GET_LOCK
	brne	setup_get_pmem
	// read fuse/lock bytes via BLBSET bit
	//   0000 = low fuse
	//   0001 = lock byte
	//   0002 = ext fuse
	//   0003 = high fuse
	ldi	r16,1<<BLBSET|1<<SPMEN
	ldi	r17,1		// step 1: addresses 0..3
	ldi	r18,4		// stop at 4, i.e. 4 bytes
	rjmp	setup_rd_spm

	// Read from program memory
	// Nothing to do except set 0x86
	// Z already holds wValue (the flash byte address); the length is taken
	// from wLength at setup_done.

setup_get_pmem:
	cpi	r24, CMD_GET_PMEM
	breq	setup_get_pmem_done

	// Write to program memory
	// Enforces that pointer/length are page-aligned (128-byte pages):
	// STALL if wValue (already in Z) or wLength has any of bits 0-6 set.
	// The temporary page buffer is reset here; the pages themselves are
	// erased and written by the state machine as the data arrives.
	// Any command not recognized up to this point is answered with STALL.

	cpi	r24,CMD_SET_PMEM
	brne	stall
	mov	r17,r8		// wLengthL (X is not loaded with wLength until setup_done)
	or	r17,ZL		// combine with the address low byte
	andi	r17,0x7f	// length or address must not have lower bits set
	brne	stall
	rcall	do_spm_rwwsre	// reset temporary page
	rjmp	setup_done

	// Endpoint state machine handling
	//
	// Each loop iteration handles at most one UEINTX bit.
	// See USB control endpoint state machine above.
	//
	// At the very end this stores intx to UEINTX to clear handled bits
	// The endpoint handling code may clear bits in intx as required

handle_state:
	// Compute next bit to process
	// Note that when state == SETUP the following code does nothing
	// because r16 = 0xff and r25 == 0
	// so this falls-through to set UEINTX = 0xff which has no effect

	// intx &= state
	and	r16, r3
	// intx = ~(intx & -intx)
	mov	r17, r16
	neg	r17		// r17 = -intx
	and	r16, r17	// isolates the lowest set bit
	com	r16		// only that bit is 0 now: value written back to UEINTX

	// r24 = cmd
	// r25 = intx & state
	//       (the selector: state with the handled bit cleared)
	movw	r24, r2
	and	r25, r16

	// Control read TXINI: write data for host to UEDATX
	// Common loop contains command-specific handling
	// Implements flash/eeprom memory reads
	// Other reads already filled UEDATX and this is a no-op
	// (for those, the loop only counts len down to zero)
	cpi	r25, RD_DATA & ~_BV(TXINI)
	brne	3f		// not ready to put data to USB
	ldi	r17,64		// EP0 FIFO size
1:	adiw	XL,0		// test len (X) for zero without changing it
	breq	state_end	// nothing to put
	// Flash memory reads, same as descriptor reads
	cpi	r24,0x86
	brne	2f
	lpm	r0,Z+
	sty	UEDATX,r0
2:	// EEPROM memory reads
	cpi	r24,CMD_GET_EEPROM
	brne	2f
	rcall	readEE		// r0 = EEPROM[Z], Z++
	sty	UEDATX,r0
2:	sbiw	XL,1		// len--
	dec	r17
	brne	1b
	// FIFO full (64 bytes): fall through; none of the selector tests below
	// match, so control reaches mask_intx, which writes intx to UEINTX.
	// The state stays RD_DATA for the next packet.
	// Control write RXOUTI: handle data from host in UEDATX
	// Unlike reads no common outer loop; each command implements its own.
	// Implements flash/eeprom memory writes
3:	cpi	r25, WR_DATA & ~_BV(RXOUTI)
	brne	wr_status_begin	// not got USB data
	// nb = UEBCLX
	ldy	r18,UEBCLX	// number of bytes in FIFO (0..64)
	// Flash memory writes
	// The temporary buffer is filled from the payload one *word* at a time.

	// Writes are always a multiple of page size and aligned to page boundaries.
	// Each OUT token comprises one half of the page temporary buffer.
	// The page is erased and written after every second token.
	cpi	r24,CMD_SET_PMEM
	brne	2f
1:	subi	r18,2		// two payload bytes left?
	brcs	1f		// no: FIFO drained
	ldy	r0,UEDATX	// r1:r0 = next word, low byte first
	ldy	r1,UEDATX
	ldi	r19,1<<SPMEN
	rcall	do_spm		// fill temporary buffer
	adiw	ZL,2
	rjmp	1b
1:	// Erase and write the page if buffer filled.
	// Z points BEYOND current page
	mov	r18,ZL
	andi	r18,0x7f	// if (lo(ptr) & 0x7f == 0) {
	brne	mask_intx	// page not complete yet: just acknowledge the packet
	sbiw	ZL,2		// go back to current page (PCWORD bits are ignored)
	ldi	r19,1<<PGERS|1<<SPMEN
	rcall	do_spm
	ldi	r19,1<<PGWRT|1<<SPMEN
	rcall	do_spm
	adiw	ZL,2		// go to next page
	rcall	do_spm_rwwsre	// re-enable read-while-write section
	// falls through: the following compare fails for CMD_SET_PMEM
2:	cpi	r24,CMD_SET_EEPROM
	brne	mask_intx
	// EEPROM memory writes.
	// The hardware allows atomic byte-wise erase+write so this is easy.
	// Loops over the token payload writing each byte.
1:	subi	r18,1
	brcs	mask_intx	// payload consumed
	rcall	readEE		// wait and read value at address Z
				// (also sets the EEPROM address and increments Z)
	ldy	r1,UEDATX
	cp	r0,r1
	breq	1b		// don't change (faster)
	out	EEDR,r1
	sbi	EECR,EEMPE
	sbi	EECR,EEPE	// write changed byte, don't wait for completion here
	rjmp	1b

	// Control write NAKINI: write finished
	// The host has started the STATUS stage of a control write. Clearing
	// TXINI together with NAKINI arms the reply for the status stage.
wr_status_begin:
	cpi	r25,WR_DATA & ~_BV(NAKINI)
	brne	wr_status_end
	andi	r16,~_BV(TXINI)
	ldi	r25,WR_STATUS
	mov	r3,r25		// state = WR_STATUS (r25 no longer matches the test below)

	// Control write TXINI in WR_STATUS: status stage completed.
	// Commands that must take effect only after the handshake run here.
wr_status_end:
	cpi	r25,WR_STATUS & ~_BV(TXINI)
	brne	mask_intx
	cpi	r24,0x05	// Set address
	brne	do_reboot
	sty	UDADDR,ZL	// Z = wValue = new device address
	ori	ZL,0x80		// second store sets bit 7 on top of the address
	sty	UDADDR,ZL
// Reboot to user code
do_reboot:
	cpi	r24,CMD_REBOOT
	breq	.-2		// stay here until watchdog reset
// Transaction finished: state = SETUP
state_end:
	clr	r3
// Write intx back to UEINTX to reset the handled bit(s)
mask_intx:
	sty	UEINTX,r16
	rjmp	loop

// readEE: fetch one EEPROM byte and advance the pointer
// Waits (feeding the watchdog) until a previous EEPROM write has finished.
// in:       Z  = EEPROM address
// out:      r0 = byte read, Z = address + 1
// clobbers: nothing else
.Lee_busy:
	wdr
readEE:
	sbic	EECR,EEPE	// previous write still in progress?
	 rjmp	.Lee_busy
	out	EEARH,ZH	// EEAR = Z
	out	EEARL,ZL
	sbi	EECR,EERE	// start the read
	in	r0,EEDR
	adiw	ZL,1		// post-increment the pointer
	ret

// Watchdog setup
// Writes WDTCSR using the change-enable sequence: first WDCE|WDE, then the
// new value immediately afterwards. The two stores must stay back to back.
// inputs:   r18 = new WDTCSR
// clobbers: r19
set_wdt:
	ldi	r19,1<<WDCE|1<<WDE
	sts	WDTCSR,r19	// unlock changes
	sts	WDTCSR,r18	// new configuration
	ret

// USB descriptors, stored in flash and read with LPM.
// The descriptor code loads hi8(dev_desc) once and only changes ZL, so all
// four tables must lie in the same 256-byte block of flash.
#define W(x) (x)&0xFF,(x)>>8	// define unaligned 16-bit LSBfirst quantities
.type dev_desc,@common		// don't disassemble
dev_desc:			// device descriptor
	.byte	18		//bLength
	.byte	1		//bDescriptorType = Device
	.byte	W(0x0200)	//bcdUSB
	.byte	0xFF		//bDeviceClass		(libusb)
	.byte	1		//bDeviceSubClass
	.byte	0		//bDeviceProtocol
	.byte	64		//bMaxPacketSize0
	.byte	W(0x03EB)	//idVendor = Atmel
	.byte	W(idProduct)	//idProduct = AVR DFU
	.byte	W(0x2310)	//bcdDevice (Year/Month)
	.byte	1		//iManufacturer (same string as iProduct)
	.byte	1		//iProduct
	.byte	0		//iSerialNumber
	.byte	1		//bNumConfigurations

.type conf_desc,@common		// don't disassemble
conf_desc:			// configuration + interface (+ endpoint) descriptors
	.byte	9		//bLength
	.byte	2		//bDescriptorType = Configuration,
	.byte	W(9+9)		//wTotalLength (the code hard-codes the same 9+9)
	.byte	1		//bNumInterfaces
	.byte	1		//bConfigurationValue
	.byte	0		//iConfiguration
	.byte	0x80		//bmAttributes
	.byte	50		//MaxPower: 100mA

	.byte	9		//bLength
	.byte	4		//bDescriptorType = Interface
	.byte	0		//bInterfaceNumber
	.byte	0		//bAlternateSetting
	.byte	0		//bNumEndpoints
	.byte	0		//bInterfaceClass
	.byte	0		//bInterfaceSubClass
	.byte	0		//bInterfaceProtocol
	.byte	0		//iInterface
.type string_desc0,@common	// don't disassemble
string_desc0:			// returned for string ID 0
	.byte	4,3,W(0x0409)	//language(english)
.type string_desc1,@common	// don't disassemble
string_desc1:			// returned for every non-zero string ID
	.byte	22,3		//L"ubaboot'23"
	.byte	'u',0,'b',0,'a',0,'b',0,'o',0,'o',0,'t',0,'\'',0,'2',0,'3',0
#undef W

// The remaining code is pinned to the last 18 bytes of the boot loader
// (0x1EE..0x1FF) so that application software can find app_spm at a fixed
// offset from the end. Any gap before it is filled with 0xFF.
.org	0x1EE,0xFF
// app_spm: entry for application software (see file header)
// Executes the SPM operation given in r19, then re-enables the RWW section.
// inputs:   r19 = SPMCSR value for the requested operation
// clobbers: r0, r19
app_spm:	// on byte end address -18
	rcall	do_spm
// Re-enable read-while-write section, also resets temporary page
// clobbers: r0, r19
do_spm_rwwsre:
	ldi	r19,1<<RWWSRE | 1<<SPMEN
// intentional fall-through to do_spm

// SPM subroutine
// Starts the SPM operation and waits until SPMEN reads back as zero,
// feeding the watchdog while waiting.
// inputs:   r19 = spmctrl = SPMCSR value
// clobbers: r0
do_spm:
	out	SPMCSR,r19
	spm			// must directly follow the SPMCSR write
1:	wdr
	in	r0,SPMCSR
	sbrc	r0,SPMEN
	 rjmp	1b
	ret
// Jump target for starting the user program: this label sits directly behind
// the last boot loader instruction.
// NOTE(review): presumably this is the end of flash, so the program counter
// wraps around to address 0x0000 - confirm against the link address.
wrap_appspace:	// on byte end address
Detected encoding: OEM (CP437)1
Wrong umlauts? - Assume file is ANSI (CP1252) encoded