Integrating with ml64.exe

I’m back to Windows.
First, I started with writing simple functions in 32 and 64bit assembly. Visual Studio compiler supports 32bit inline assembly, but doesn’t support 64bit. Today, I used ml.exe/ml64.exe for both.

Here is the caller in C.

// asm.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

// Prototypes for the routines implemented in assembly (asm64.asm / asm32.asm).
// extern "C" disables C++ name mangling so the linker can match these names
// against the assembler's PROC symbols.
#ifdef __cplusplus
extern "C" {
#endif
  char* _message();         // returns a pointer to a NUL-terminated string
  int _add(int a, int b);   // returns a + b
  int _sub(int a, int b);   // returns a - b
  int _fib(int a);          // returns the a-th Fibonacci number (fib(0) = 0, fib(1) = 1)
#ifdef __cplusplus
}
#endif

// Console entry point: calls each assembly routine once (and _fib for
// 0..19) and prints the results so the 32- and 64-bit builds can be compared.
int _tmain(int argc, _TCHAR* argv[])
{
  const int lhs = 10;
  const int rhs = 20;

  printf("* started\n");
  printf("%d+%d=%d\n", lhs, rhs, _add(lhs, rhs));
  printf("%d-%d=%d\n", lhs, rhs, _sub(lhs, rhs));

  int n = 0;
  while (n < 20) {
    printf("fib(%d)=%d\n", n, _fib(n));
    ++n;
  }

  printf("%s\n", _message());
  printf("* done\n");
  return 0;
}

That calls these.
asm64.asm:

.data
MSG	DB  "Hoge x64,", 0dh, 0ah, "Page!", 0dh, 0ah, 0

.code
; char* _message()
; Takes no arguments; returns the address of the NUL-terminated MSG in rax.
_message PROC
	lea rax, MSG			; rax = &MSG
	ret
_message ENDP

; int _add(int a, int b)
; In:  ecx = a, edx = b
; Out: eax = a + b
_add PROC
mov eax, ecx
add eax, edx
ret
_add ENDP

; int _sub(int a, int b)
; In:  ecx = a, edx = b
; Out: eax = a - b
_sub PROC
mov eax, ecx
sub eax, edx
ret
_sub ENDP

; int _fib(int a)
; ABI:  Microsoft x64
; In:   ecx = a (only the low 32 bits are meaningful for an int argument)
; Out:  eax = a-th Fibonacci number (fib(0) = 0, fib(1) = 1); 0 for a <= 0
; Uses: rbx (a-1, then a-2) and rsi (fib(a-1)); both are callee-saved and
;       are preserved here.
; The frame keeps rsp 16-byte aligned at each call and provides the 32-byte
; shadow space the callee is entitled to; FRAME/.pushreg/.allocstack emit
; the unwind data required for a non-leaf function.
_fib PROC FRAME
	push rbx
	.pushreg rbx
	push rsi
	.pushreg rsi
	sub rsp, 28h			; 32 bytes shadow space + 8 bytes alignment padding
	.allocstack 28h
	.endprolog

	xor eax, eax
	test ecx, ecx
	jle @done			; a <= 0 -> 0 (signed test, so negatives terminate)
	mov eax, 1
	cmp ecx, 1
	je @done			; a == 1 -> 1

	mov ebx, ecx
	dec ebx				; ebx = a - 1
	mov ecx, ebx
	call _fib
	mov esi, eax			; esi = fib(a - 1)

	dec ebx				; ebx = a - 2
	mov ecx, ebx
	call _fib
	add eax, esi			; eax = fib(a - 2) + fib(a - 1)

@done:
	add rsp, 28h
	pop rsi
	pop rbx
	ret
_fib ENDP

END

asm32.asm:

; asm32.asm -- 32-bit build of the same routines, assembled with ml.exe.
; ABI: cdecl. Arguments are on the stack ([esp+4] is the first one on
; entry), the result is returned in eax, the caller removes the arguments,
; and ebx/esi/edi/ebp must be preserved.
; ".model flat, c" makes the assembler apply C name decoration, so the
; PROC names below match the extern "C" prototypes in asm.cpp.
.386
.model flat, c

.data
MSG	DB  "Hoge x86,", 0dh, 0ah, "Page!", 0dh, 0ah, 0

.code
; char* _message()
; Out: eax = address of the NUL-terminated MSG
_message PROC
	mov eax, offset MSG
	ret
_message ENDP

; int _add(int a, int b)
; Out: eax = a + b
_add PROC
	mov eax, [esp + 4]		; a
	add eax, [esp + 8]		; + b
	ret
_add ENDP

; int _sub(int a, int b)
; Out: eax = a - b
_sub PROC
	mov eax, [esp + 4]		; a
	sub eax, [esp + 8]		; - b
	ret
_sub ENDP

; int _fib(int a)
; Out: eax = a-th Fibonacci number (fib(0) = 0, fib(1) = 1); 0 for a <= 0
; ebx holds a-1 / a-2 and esi holds fib(a-1) across the recursive calls;
; both are callee-saved, so they are pushed and restored.
_fib PROC
	push ebx
	push esi
	mov ebx, [esp + 12]		; a (above saved esi, saved ebx, return address)

	xor eax, eax
	test ebx, ebx
	jle @done			; a <= 0 -> 0
	mov eax, 1
	cmp ebx, 1
	je @done			; a == 1 -> 1

	dec ebx				; ebx = a - 1
	push ebx
	call _fib
	add esp, 4			; cdecl: caller pops the argument
	mov esi, eax			; esi = fib(a - 1)

	dec ebx				; ebx = a - 2
	push ebx
	call _fib
	add esp, 4
	add eax, esi			; eax = fib(a - 2) + fib(a - 1)

@done:
	pop esi
	pop ebx
	ret
_fib ENDP

END

Configure the vcproj to run ml/ml64 as a custom build step, as shown below.
asm64_config

64 bit result:

C:\Users\sokoide\Projects\Spike\x64\Debug\asm.exe
* started
10+20=30
10-20=-10
fib(0)=0
fib(1)=1
fib(2)=1
fib(3)=2
fib(4)=3
fib(5)=5
fib(6)=8
fib(7)=13
fib(8)=21
fib(9)=34
fib(10)=55
fib(11)=89
fib(12)=144
fib(13)=233
fib(14)=377
fib(15)=610
fib(16)=987
fib(17)=1597
fib(18)=2584
fib(19)=4181
Hoge x64,
Page!

* done

32 bit result:

>C:\Users\sokoide\Projects\Spike\Debug\asm.exe
* started
10+20=30
10-20=-10
fib(0)=0
fib(1)=1
fib(2)=1
fib(3)=2
fib(4)=3
fib(5)=5
fib(6)=8
fib(7)=13
fib(8)=21
fib(9)=34
fib(10)=55
fib(11)=89
fib(12)=144
fib(13)=233
fib(14)=377
fib(15)=610
fib(16)=987
fib(17)=1597
fib(18)=2584
fib(19)=4181
Hoge x86,
Page!

* done

Preemptive multi-tasking

I changed the previous cooperative threads into preemptive threads.
It switches contexts on the timer interrupt every 16 ms, using simple (non-intelligent) round-robin scheduling.
I noticed that ARM banks R13/R14 in IRQ mode, so I needed to go back and forth between IRQ and SVC mode to get the original (banked) registers, as shown below. When the IRQ_handler C function returns non-null, it switches contexts.

// IRQ entry point (the vector table refers to it by this spelling).
// Saves r0-r12 and the adjusted return address on the IRQ stack, then spsr
// plus the sp/lr read while in mode 0x13 (svc), and calls IRQ_handler.
// If IRQ_handler returns 0, everything is restored and the interrupted code
// resumes; otherwise control goes to _IRQ_interrupt_context_switch with the
// return value still in r0.
_IRQ_iterrupt:
  //-- irq mode
  sub lr, lr, #4 // in IRQ mode, r14_irq(lr_irq) points to PC+#4 in user mode
  // save context
  push {r0-r12, lr} // save user mode registers

  mrs r0, spsr // spsr -> r0
  cps #0x13
  //-- svc mode
  // read the banked sp/lr of the interrupted (svc mode) code
  mov r1, sp
  mov r2, lr

  cps #0x12
  //-- irq mode
  push {r0-r2} // save spsr, user mode sp, lr

  // call IRQ_handler
  // NOTE(review): this was described as passing the user-mode sp, but r2
  // holds the lr copied above (r1 holds the sp) -- confirm which value
  // IRQ_handler expects.
  mov r0, r2
	bl	IRQ_handler
  cmp r0, #0
  bne _IRQ_interrupt_context_switch // non-zero: switch to another thread

  // zero: resume the interrupted code unchanged
  pop {r0-r2} // restore spsr, user mode sp, lr
  msr spsr, r0 // r0 -> spsr
  cps #0x13
  //-- svc mode
  mov sp, r1 // restore sp
  mov lr, r2 // restore lr

  cps #0x12
  //-- irq mode
  pop  {r0-r12,lr}
  movs pc, lr // the 's' form also restores cpsr from spsr

// Context-switch tail of the IRQ handler.
// On entry (irq mode): r0 = next thread's saved stack pointer; the IRQ stack
// holds {spsr, sp, lr} on top of the 14 words {r0-r12, lr} pushed at entry.
// Step 1: copy the interrupted thread's state onto its own stack (through
//         r2) as 16 words, from high to low address:
//         spsr, pc, lr, r12, r11, ..., r0.
// Step 2: in svc mode, adopt the next thread's stack, stash its saved pc and
//         spsr just below the IRQ stack pointer (through r4), and reload
//         r0-r12 and lr from it.
// Step 3: back in irq mode, load the stashed spsr/pc, drop the 14 words
//         pushed at entry, and return with movs.
// NOTE(review): the final value of r2 (lowest address of the frame written
// in step 1) is not stored anywhere in this routine; presumably IRQ_handler
// records the outgoing thread's stack pointer itself -- confirm.
_IRQ_interrupt_context_switch:
  //-- irq mode
  // r0 is next thread's SP

  pop {r1-r3} // restore spsr, user mode sp, lr

  // save registers in user mode stack
  // r1: spsr (the interrupted code's cpsr)
  // r2: user mode sp
  // r3: user mode lr
  sub r2, r2, #4
  str r1, [r2] // spsr

  // [r13, #4*k] below reads the k-th word of the {r0-r12, lr} block pushed
  // at IRQ entry; r4 is free to use as scratch because its saved copy is
  // read from that block before r4 is needed again.
  ldr r4, [r13, #4*13]
  sub r2, r2, #4
  str r4, [r2] // user mode pc (r14_irq)

  sub r2, r2, #4
  str r3, [r2] // user mode lr

  ldr r4, [r13, #4*12]
  sub r2, r2, #4
  str r4, [r2] // user mode r12

  ldr r4, [r13, #4*11]
  sub r2, r2, #4
  str r4, [r2] // user mode r11

  ldr r4, [r13, #4*10]
  sub r2, r2, #4
  str r4, [r2] // user mode r10

  ldr r4, [r13, #4*9]
  sub r2, r2, #4
  str r4, [r2] // user mode r9

  ldr r4, [r13, #4*8]
  sub r2, r2, #4
  str r4, [r2] // user mode r8

  ldr r4, [r13, #4*7]
  sub r2, r2, #4
  str r4, [r2] // user mode r7

  ldr r4, [r13, #4*6]
  sub r2, r2, #4
  str r4, [r2] // user mode r6

  ldr r4, [r13, #4*5]
  sub r2, r2, #4
  str r4, [r2] // user mode r5

  ldr r4, [r13, #4*4]
  sub r2, r2, #4
  str r4, [r2] // user mode r4

  ldr r4, [r13, #4*3]
  sub r2, r2, #4
  str r4, [r2] // user mode r3

  ldr r4, [r13, #4*2]
  sub r2, r2, #4
  str r4, [r2] // user mode r2

  ldr r4, [r13, #4*1]
  sub r2, r2, #4
  str r4, [r2] // user mode r1

  ldr r4, [r13]
  sub r2, r2, #4
  str r4, [r2] // user mode r0

  mov r4, sp // r4 <- r13_irq

  msr spsr, r1 // r1 -> spsr
  cps #0x13          //@ svc mode
  //-- svc mode
  // change user mode stack to next thread's stack
  mov sp, r0
  // push into r13_irq (r4): the words land just below the IRQ stack pointer
  // without moving it; irq mode picks them up again further down
  ldr r2, [sp, #4*14] // user mode pc
  sub r4, r4, #4
  str r2, [r4]
  ldr r2, [sp, #4*15] // spsr
  sub r4, r4, #4
  str r2, [r4]
  // restore registers
  pop {r0-r12,lr}
  add sp, sp, #4*2 // pop pc, spsr

  cps #0x12          //@ irq mode
  //-- irq mode
  sub sp, sp, #4*2 // step down onto the two words stored through r4
  pop {lr} // lr <- next thread's spsr
  // enable irq and restore spsr
  bic lr, lr, #0x80
  msr spsr, lr // lr -> spsr
  // restore lr (user mode pc)
  pop {lr}
  // discard pushed registers
  add r13, r13, #4*14
  // movs pc, * ... mov's' pc restores status register
  movs pc, lr

I also tried to implement a critical section, but the version from the ARM reference hung, probably because I haven’t set up the MMU.

// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dht0008a/ch01s03s02.html
.equ  locked,   1
.equ  unlocked, 0

// BUG: lock_mutex hangs
// LDREX doesn't work when MMU is disabled. Don't use this.
// Declare for use from C as extern void lock_mutex(void * mutex);
// In: r0 = address of the mutex word. Clobbers r1, r2 and the flags.
// Spins until the word reads `unlocked` and an exclusive store of `locked`
// to it succeeds.
.global _lock_mutex_mmu
_lock_mutex_mmu:
  LDR     r1, =locked
1:
  LDREX   r2, [r0]      // exclusive load of the current mutex value
  CMP     r2, r1        // Test if mutex is locked or unlocked
  BEQ     2f            // already locked: go wait
  STREXNE r2, r1, [r0]  // Not locked, attempt to lock it
  CMPNE   r2, #1        // Check if Store-Exclusive failed
  BEQ     1b           // Failed - retry from 1
  // Lock acquired
  DMB                   // Required before accessing protected resource
  BX      lr
2:
// Take appropriate action while waiting for mutex to become unlocked
  //wfi
  nop
  B       1b           // Retry from 1


// BUG: unlock_mutex
// (counterpart of _lock_mutex_mmu, which is marked as hanging)
// Declare for use from C as extern void unlock_mutex(void * mutex);
// In: r0 = address of the mutex word. Clobbers r1.
.global _unlock_mutex_mmu
_unlock_mutex_mmu:
    LDR     r1, =unlocked
    DMB                   // Required before releasing protected resource
    STR     r1, [r0]      // Unlock mutex
    // SIGNAL_UPDATE: none
    BX      lr


// BUG: SWP version
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dht0008a/CJHBGBBJ.html
// still hangs
// In: r0 = address of the mutex word. Clobbers r1, r2 and the flags.
.global _lock_mutex_swp
_lock_mutex_swp:
    MOV r2, #locked        // value to swap in
1:
    SWP r1, r2, [r0]       // r1 = previous [r0]; [r0] = r2
    CMP r1, r2             // was the previous value ‘locked’?
    BEQ 1b                 // yes: held by someone else, retry immediately
    BX  lr                 // no: the lock is now ours

// BUG (still present): not really exclusive. If a context switch happens
// after the ldr and before the str, two threads can both observe `unlocked`
// and both take the lock.
// In: r0 = address of the mutex word. Clobbers r2, r3 and the flags.
.global _lock_mutex_simple
_lock_mutex_simple:
  ldr r3, =locked
1:
  ldr r2, [r0]
  cmp r2, r3
  beq 1b            // spin while the mutex is held
  str r3, [r0]      // take it: write `locked` (storing `unlocked` here would never take the lock)
  bx lr

// Releases a mutex taken with _lock_mutex_simple.
// In: r0 = address of the mutex word. Clobbers r1.
.global _unlock_mutex_simple
_unlock_mutex_simple:
    MOV r1, #unlocked
    STR r1, [r0]           // mutex word = ‘unlocked’
    BX  lr

Today’s code -> https://github.com/sokoide/rpi-baremetal -> 009_context_switch2

Learning ARM assembly basics

Before writing more ARM code, I read up on the basics on the Internet.

STMFD = store multiple registers, full descending stack = stmdb (decrement before)
LDMFD = load multiple registers, full descending stack = ldmia (increment after)
If ! is specified after the base register, the updated address is written back to it.

STMFD:

pre:
r1 = 0x1
r2 = 0x2
r13 = 0x00008000

stmfd r13!, {r1, r2}

post:
r1 = 0x1
r2 = 0x2
r13 = 0x00007ff8
mem[0x00007ff8] = 0x00000001
mem[0x00007ffc] = 0x00000002

LDMFD:

pre:
r1 = 0x0
r2 = 0x0
r13 = 0x00008000
mem[0x00008000] = 0x00000001
mem[0x00008004] = 0x00000002

ldmfd r13!, {r1, r2}

post:
r1 = 0x00000001
r2 = 0x00000002
r13 = 0x00008008

And some more info.
R0-R3: scratch registers (arguments and results), don’t need to be saved
R4-R11: callee saved (R12 is IP, a scratch register that a call may clobber)
R13=SP (Stack Pointer) stack
R14=LR (Link Register) to store return address of function call
R15=PC (Program Counter) instruction pointer

If you want to do like this x86,

push 1
push 2
call hoge
// result in EAX

It’ll be like this in ARM.

mov r0, #1
mov r1, #2
bl hoge // set LR=next instruction pointer and branch to hoge
// result in r0

startup.s:

@ startup
  .align

// Entry point: set the processor mode, set up the stack, run main, then hang.
.global _start
_start:
  mov r0, #0xd3         // 0xd3 = svc mode (0x13) with the I and F mask bits (0xc0) set
  msr cpsr, r0
  ldr sp, =0x06400000   // initial stack pointer
  bl main
  b .                   // main returned: spin forever

// int _add(int a, int b): r0 = a, r1 = b; returns a + b in r0.
.global _add
_add:
  add r0, r1
  bx lr

// int _sub(int a, int b): r0 = a, r1 = b; returns a - b in r0.
.global _sub
_sub:
  sub r0, r1
  bx lr

// int _fib(int n)
// In:  r0 = n
// Out: r0 = n-th Fibonacci number (fib(0) = 0, fib(1) = fib(2) = 1).
//      As before, any negative n also yields 1 (signed `ble` below).
// r4 holds the result and r5 holds n-2 across the recursive calls; both are
// callee-saved, so they are pushed here. r6 is pushed only to make the
// frame four words, which keeps sp 8-byte aligned at each call.
.global _fib
_fib:
  push {r4, r5, r6, lr}

  // fib(0) = 0
  mov r4, #0
  cmp r0, #0
  beq _fib_end

  // fib(1) = fib(2) = 1
  mov r4, #1
  cmp r0, #2
  ble _fib_end

  // r0 = n-1, r5 = n-2
  sub r0, r0, #1
  sub r5, r0, #1

  // r4 = fib(n-1)
  bl _fib
  mov r4, r0

  // r4 += fib(n-2)
  mov r0, r5
  bl _fib
  add r4, r4, r0

_fib_end:
  mov r0, r4
  pop {r4, r5, r6, pc}    // restores the saved registers and returns

hoge.c:

static const int kA = 3;
static const int kB = 4;

int _add(int a, int b);
int _sub(int a, int b);
int _fib(int a);

int main(int argc, char const* argv[]) {  // calls each assembly routine; results are inspected from gdb
  int a = kA;
  int b = kB;

  int result;  // overwritten by every call; never read by the program itself
  result = _add(a, b);
  result = _sub(a, b);
  result = _fib(0);   // gdb session shows 0
  result = _fib(1);   // gdb session shows 1
  result = _fib(2);   // gdb session shows 1
  result = _fib(6);   // gdb session shows 8
  result = _fib(10);  // gdb session shows 55
}

And tested it with gdb+ARM sim.

$cat startup.gdb
target sim
file hoge
load hoge
b main
run

$arm-unknown-eabi-gdb --command=startup.gdb
...
15        result = _fib(0);
(gdb) n
16        result = _fib(1);
(gdb) p result
$1 = 0
(gdb) n
17        result = _fib(2);
(gdb) p result
$2 = 1
(gdb) n
18        result = _fib(6);
(gdb) p result
$3 = 1
(gdb) n
19        result = _fib(10);
(gdb) p result
$4 = 8
(gdb) n
20      }
(gdb) p result
$5 = 55

Thread and context switch

I implemented a simple context switch for ARM. First, I implemented a THREAD and THREADCTL as below.

*** rpi.h
// Capacity of the THREADCTL.thread table.
#define MAX_THREADS 1024
// Registers the currently running code as thread 0.
void InitThread();
// Adds a thread whose first instruction is thread_entry.
void CreateThread(void *thread_entry);
// Switches to the next thread in the table (wrapping around at the end).
void ContextSwitch();

// Lifecycle state of a THREAD slot.
typedef enum {
  THREAD_NONE,     // first enumerator (value 0)
  THREAD_CREATED,  // set by CreateThread
  THREAD_RUNNING,  // set on the thread being switched to
  THREAD_WAITING   // set on the thread being switched away from
} THREAD_STATE;

// Per-thread bookkeeping.
typedef struct {
  THREAD_STATE state;
  int *stack;              // saved stack pointer (NULL for thread 0 until its first switch)
  unsigned int stackSize;  // stack size in ints, not bytes
} THREAD;

// Table of all threads.
typedef struct {
  unsigned int currentId;  // index of the running thread
  unsigned int length;     // number of slots in use
  THREAD thread[MAX_THREADS];
} THREADCTL;

extern THREADCTL threadctl;

*** rpi-thread.c
THREADCTL threadctl;

void InitThread() {
  threadctl.length = 1;
  threadctl.thread[0].state = THREAD_RUNNING;
  threadctl.currentId = 0;

  threadctl.thread[0].stack = NULL;
  /* threadctl.thrad[0].stackSize = TBD; */
}

void CreateThread(void *thread_entry) {
  unsigned int id = threadctl.length;
  const unsigned int stackSize = 4096;  // 4096 * sizeof(int) allocated
  int *stackBase;

  threadctl.length++;
  threadctl.thread[id].state = THREAD_CREATED;
  stackBase = (int *)malloc(stackSize * sizeof(int));
  stackBase += stackSize - 1;
  threadctl.thread[id].stack = stackBase-13;
  threadctl.thread[id].stackSize = stackSize;

  // push default registers into the stack which will be poped in
  // _context_switch
  *threadctl.thread[id].stack =
      (int)thread_entry;  // r14: lr (to be stored in pc)
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r12
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r11
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r10
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r9
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r8
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r7
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r6
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r5
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r4
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r3
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r2
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r1
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r0
  // don't do stack--!
}

// Switches from the current thread to the next one in the table (round
// robin, wrapping to slot 0). Prints a one-line trace on screen first.
// Returns immediately while only one thread exists. For the calling thread,
// this function returns when some other thread later switches back to it.
void ContextSwitch() {
  if (threadctl.length <= 1) {
    return;
  }

  const unsigned int currentId = threadctl.currentId;
  const unsigned int nextId =
      (currentId + 1 >= threadctl.length) ? 0 : currentId + 1;

  // Trace line. The thread ids are unsigned (%u) and %p is given the saved
  // stack pointers themselves, cast to void *, so every argument matches
  // its conversion. snprintf bounds the write to the buffer.
  // NOTE(review): timerctl.counter is printed with %d as before; confirm
  // that it is an int.
  char message[512];
  if (NULL != threadctl.thread[currentId].stack &&
      NULL != threadctl.thread[nextId].stack) {
    snprintf(message, sizeof(message), "%d jumping from th:%u@%p to th:%u@%p",
             timerctl.counter, currentId,
             (void *)threadctl.thread[currentId].stack, nextId,
             (void *)threadctl.thread[nextId].stack);
  } else {
    snprintf(message, sizeof(message), "%d jumping from th:%u to th:%u",
             timerctl.counter, currentId, nextId);
  }
  FillRect(0, 16, kWidth, 16, 0);
  PrintStr(0, 16, message, 7);

  // Record the current sp; _context_switch overwrites this with the exact
  // value after it has pushed the registers.
  threadctl.thread[currentId].stack = (int *)_get_stack_pointer();

  // Update the bookkeeping, then switch stacks.
  threadctl.thread[nextId].state = THREAD_RUNNING;
  threadctl.thread[currentId].state = THREAD_WAITING;
  threadctl.currentId = nextId;
  _context_switch(&threadctl.thread[currentId].stack,
                  &threadctl.thread[nextId].stack);
}

_context_switch is written in assembly: it pushes r0-r12 and r14 (lr) onto the current thread’s stack and saves the stack pointer in the per-thread variable ‘stack’.

// Returns the caller's current stack pointer in r0.
.global _get_stack_pointer
_get_stack_pointer:
  mov r0, sp
  bx lr
  
// _context_switch(r0, r1)
//   r0 = address of the word that receives the outgoing thread's sp
//   r1 = address of the word holding the incoming thread's saved sp
// Pushes r0-r12 and lr on the current stack, stores sp through r0, loads sp
// through r1, then pops r0-r12 and the saved lr (into pc) from the new stack.
.global _context_switch
_context_switch:
  // same as stmfd/stmdb r13!, {...}
  push {r0-r12,r14}
  str sp, [r0]
  ldr sp, [r1]
  // same as ldmfd/ldmia r13!, {...}
  pop {r0-r12}
  pop {pc} // pc points to the previous lr

I use it in main function.

InitThread(); // add the current thread into THREADCTL
CreateThread(task_a); // create a new thread for task_a and start it
CreateThread(task_b); // create a new thread for task_b and start it
...
// switch contexts using timer interrupt
while (true) {
  _disable_IRQ();
  _wfi();
  if (StatusFifo8(&fifoTimer) == 0) {
    _enable_IRQ();
  } else {
    unsigned char data = GetFifo8(&fifoTimer);
    _enable_IRQ();
      
     switch (data) {
      case (const int)timerData1:
        counter1++;
        SetTimer(timer1, timerInterval1, timerData1);
        draw_counter(0, counter1);
        ContextSwitch();
        break;
    }
  }
}
  
// Thread body: draws an incrementing counter in slot 1 and yields after
// every update. Never returns.
void task_a() {
  for (unsigned int counter = 0;; counter++) {
    draw_counter(1, counter);
    ContextSwitch(); // for now, the task yields explicitly
  }
}

// Thread body: draws an incrementing counter in slot 2 and yields after
// every update. Never returns.
void task_b() {
  for (unsigned int counter = 0;; counter++) {
    draw_counter(2, counter);
    ContextSwitch(); // for now, the task yields explicitly
  }
}

Tested it and worked!
Today’s task_a/b call ContextSwitch() explicitly. I’ll make the context switch fully automatic in the next post.

https://github.com/sokoide/rpi-baremetal -> 008_context_switch.

context_switch

Print string

I drew a string using hankaku.bin 8×16 font provided by the X86 OS book.
First, I converted the 4096 byte bin file into elf format.

hankaku.o: hankaku.bin                                                                                     
  $(OBJCOPY) -I binary -O elf32-littlearm -B arm $< $@                                                     

When you convert it that way, you can refer to the start of the font data in the .data section through the symbol _binary_hankaku_bin_start, as below.

// Draws the NUL-terminated string `str` starting at pixel (x, y), advancing
// 8 pixels per character, by calling myputchar for each character.
void printstr(int x, int y, char *str, char color) {
  int penX = x;
  for (const char *p = str; *p != '\0'; p++) {
    myputchar(penX, y, *p, color);
    penX += 8;
  }
}

void myputchar(int x, int y, char c, char color) {
  char *hankaku = (char *)&_binary_hankaku_bin_start;
  char *base = (char *)fbRequest.fbBaseAddress;
  char *p;
  char d;

  for (int i = 0; i < 16; i++) {
    p = base + (y + i) * kWidth + x;
    d = hankaku[c * 16 + i];
...                                                

And it's displayed by this!

  printstr(10, 0, "HOG", 7);
  printstr(10 + 8 * 3, 0, "E", 1);

Today's code: https://github.com/sokoide/rpi-baremetal -> 005_character.

hoge

Timer interrupt

Implemented timer. First, it needs interrupt vector table at address 0x00000000.
The following ARM code (my first attempt, not optimized), “_init_vector_table”, copies the range from _initialize_vector_start to _initialize_vector_end to address 0x00000000.

First, I tried to do the copy in C, but uint32_t* vector_table; *vector_table=… was not compiled as I expected by the cross gcc 4.9, perhaps because address 0 is NULL in C and dereferencing NULL is undefined.

# Template of the exception vector table. Each "ldr pc, _vec_X" loads the
# handler address from the word table that follows, so the eight
# instructions and the eight words are meant to be copied together; both lie
# between the _initialize_vector_start and _initialize_vector_end labels.
.global  _initialize_vector_start
_initialize_vector_start:
        ldr     pc, _vec_Reset
        ldr     pc, _vec_Undef
        ldr     pc, _vec_SWI
        ldr     pc, _vec_PrefAbort
        ldr     pc, _vec_DataAbort
        ldr     pc, _vec_Reserved
        ldr     pc, _vec_IRQ
        ldr     pc, _vec_FIQ
# Handler addresses: only reset and IRQ have real handlers; everything else
# goes to _hangup.
_vec_Reset:             .word   _start
_vec_Undef:             .word   _hangup
_vec_SWI:               .word   _hangup
_vec_PrefAbort: .word   _hangup
_vec_DataAbort: .word   _hangup
_vec_Reserved:  .word   _hangup
_vec_IRQ:               .word   _IRQ_iterrupt
_vec_FIQ:               .word   _hangup
        .global  _initialize_vector_end
_initialize_vector_end:
# dummy instruction to keep initialize_vector_end label
        mov r0,r0

// Copies the words in [_initialize_vector_start, _initialize_vector_end)
// to address 0x00000000 and upward. Clobbers r0-r3 and the flags.
.global _init_vector_table
_init_vector_table:
  mov r0, #0                          // destination cursor
  ldr r1, =_initialize_vector_start   // source cursor
  ldr r3, =_initialize_vector_end     // one past the last source word
1:
  cmp r1, r3
  bxeq lr                             // source exhausted (or empty): done
  ldr r2, [r1], #4                    // load a word, then advance the source
  str r2, [r0], #4                    // store it, then advance the destination
  b 1b

Then, _IRQ_iterrupt calls C function IRQ_handler.

// IRQ entry: save r0-r12 and lr, run the C handler, restore the registers,
// and return to the interrupted instruction.
_IRQ_iterrupt:
	push	{r0-r12, lr}
	bl	IRQ_handler
	pop	{r0-r12, lr}
	subs	pc, lr, #4		// return to lr - 4; the 's' form also restores cpsr from spsr

IRQ_handler sets g_interrupt flag.

volatile static bool g_interrupt = false;

// Called by _IRQ_iterrupt (spelled that way at the label) in startup.s.
// If the timer interrupt is pending, sets g_interrupt for the main loop and
// acknowledges the interrupt.
void IRQ_handler(void) {
  // disable IRQ
  _disable_IRQ();

  // Test bit 0 of the basic-pending register. The parentheses matter:
  // `!=` binds tighter than `&`, so `*reg & 0x01 != 0` would parse as
  // `*reg & (0x01 != 0)`.
  if ((*INTERRUPT_IRQ_BASIC_PENDING & 0x01) != 0) {
    // Timer interrupt handler
    g_interrupt = true;
    // clear interrupt flag
    *TIMER_IRQ_CLR = 0;
  }

  // enable IRQ
  // NOTE(review): this re-enables IRQs before the assembly stub has
  // returned from the exception -- confirm that a nested IRQ at this point
  // is acceptable.
  _enable_IRQ();
}

Then dirty main loop is checking the flag. I’ll change it to wait for interrupt like another x86 OS is doing.

  while (true) {
    _disable_IRQ();
    if (true == g_interrupt) {
      g_interrupt = false;
      _enable_IRQ();
      redraw();
    } else {
      _enable_IRQ();
    }
  }

Today’s code: https://github.com/sokoide/rpi-baremetal -> 004_timer_interrupt.

Implementing syscalls for libc

If you try to use C library functions such as malloc() or printf(), linking fails with the errors below, because they depend on these system calls.

/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-sbrkr.o): In function `_sbrk_r':
sbrkr.c:(.text+0x18): undefined reference to `_sbrk'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-writer.o): In function `_write_r':
writer.c:(.text+0x20): undefined reference to `_write'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-closer.o): In function `_close_r':
closer.c:(.text+0x18): undefined reference to `_close'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-fstatr.o): In function `_fstat_r':
fstatr.c:(.text+0x1c): undefined reference to `_fstat'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-isattyr.o): In function `_isatty_r':
isattyr.c:(.text+0x18): undefined reference to `_isatty'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-lseekr.o): In function `_lseek_r':
lseekr.c:(.text+0x20): undefined reference to `_lseek'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-readr.o): In function `_read_r':
readr.c:(.text+0x20): undefined reference to `_read'

Obviously we can’t use an OS’s syscalls, since there is no OS. The book suggested implementing them by referring to ~/cross/src/ct-ng_rpi/.build/src/newlib-1.20.0/newlib/libc/sys/arm/syscalls.c.

I implemented a stub and wrote _write as below. Since there is no stdout yet, I only copied it in memory and confirmed it in gdb + arm sim.

_write in syscalls.c:

// Minimal write stub for newlib: hands each of the `len` bytes at `ptr` to
// _write_memory and reports all of them as written. `file` is ignored.
int _write(int file, char *ptr, int len) {
  // TODO:
  const char *cursor = ptr;
  int remaining = len;

  while (remaining-- > 0) {
    _write_memory(*cursor++);
  }
  return len;
}

_write_memory in startup.s: only to confirm it's called with the right character

// Stub that returns immediately. It exists only as a place to set a
// breakpoint and inspect the character _write passes in (presumably in r0,
// as the first argument -- confirm in the debugger).
_write_memory:
  bx  lr

confirmation: tried to confirm with arm sim on gdb, and got this. It looks it's not implemented in arm sim.

(gdb) target sim
Connected to the simulator.
(gdb) load hoge.elf
Loading section .text, size 0x8cf0 vma 0x8000
Loading section .rodata, size 0xa48 vma 0x10cf0
Loading section .rodata.str1.4, size 0xc vma 0x11738
Loading section .rodata.str1.1, size 0x4b vma 0x11744
Loading section .ARM.exidx, size 0x8 vma 0x11790
Start address 0x8000
Transfer rate: 310456 bits in <1 sec.
(gdb) file hoge.elf
Reading symbols from hoge.elf...done.
(gdb) b _write_memory
Breakpoint 1 at 0x8020
(gdb) start
Temporary breakpoint 2 at 0x808c
Starting program: /media/psf/Dropbox/workspace/arm/rpi-baremetal/002/hoge.elf 

Temporary breakpoint 2, 0x0000808c in main ()
(gdb) c
Continuing.
Unhandled v6 thumb insn: 4601
[Inferior 1 (process 42000) exited with code 0215]

Uploaded above at https://github.com/sokoide/rpi-baremetal -> 002_libcstub.

First RPi2 bare metal code

First bare metal code for Raspberry Pi2 hardware uploaded at https://github.com/sokoide/rpi-baremetal/tree/master/001_led.
Used http://www.valvers.com/open-software/raspberry-pi/step02-bare-metal-programming-in-c-pt2/ for GPIO port definition for RPi2 and http://tatsu-zine.com/books/raspi-bm for section def & boot startup code.

When the hardware starts up, it halts CPU and starts GPU which loads bootloader from ROM which loads bootcode.bin which reads config.txt. Start.elf loads the boot image file written in config.txt at 0x00008000 and runs it.
When you build it, you’ll get hoge.img which is loaded and executed by start.elf during boot and see a blinking green LED at fast, regular, slow, regular, fast… paces.

Here is my first bare metal code which is only 480 bytes.

hoge.lds:

/* Linker script for the bare-metal image: everything is laid out
   contiguously starting at 0x8000, in the order
   .text, .rodata, .data, .bss. */
OUTPUT_ARCH(arm)
ENTRY(_start)
SECTIONS
{
	/* Address the image is linked to run at. */
	. = 0x8000;

	.text : { *(.text*) }
	. = ALIGN(4);

	/* Read-only data. The input pattern must be .rodata*; matching
	   .data* here would pull the writable data into this section and
	   leave the real .rodata* inputs as orphan sections. */
	__rodata_start = .;
	.rodata : { *(.rodata*) }
	. = ALIGN(4);
	__rodata_end = .;

	__data_start = . ;
	.data : { *(.data*) }
	. = ALIGN(4);
	__data_end = . ;

	/* Zero-initialized data; COMMON collects tentative definitions. */
	__bss_start = . ;
	.bss : { *(.bss*) *(COMMON) }
	. = ALIGN(4);
	__bss_end = . ;
}

startup.s: in supervisor mode (d3), using 1MB stack at 0x06400000 although I don’t need that much.

@ startup
  .global _start
  .align

@ Entry point: set the processor mode, set up the stack, run main, then hang.
_start:
  mov r0, #0xd3         @ 0xd3 = svc mode (0x13) with the I and F mask bits (0xc0) set
  msr cpsr, r0
  ldr sp, =0x06400000   @ initial stack pointer
  bl main
  b .                   @ main returned: spin forever

hoge.c:

#include "lib/rpi.h"
#define kFastInterval 200000
#define kRegularInterval 500000
#define kSlowInterval 1000000

volatile unsigned int* gpio = (unsigned int*)GPIO_BASE;
volatile unsigned int tim;

void wait(unsigned int interval);

/* Busy-waits by counting the volatile global `tim` from 0 up to `interval`.
   The delay is in loop iterations, not in a fixed time unit. */
void wait(unsigned int interval) {
  tim = 0;
  while (tim < interval) {
    tim++;
  }
}

void led_on() {
  /* Drive the LED GPIO pin high (this turns the OK LED off on the original
     Pi and on for plus models). */
  const unsigned int pinMask = (1 << LED_GPIO_BIT);
  gpio[LED_GPSET] = pinMask;
}

void led_off() {
  /* Drive the LED GPIO pin low (this turns the OK LED on for the original
     Pi and off for plus models). */
  const unsigned int pinMask = (1 << LED_GPIO_BIT);
  gpio[LED_GPCLR] = pinMask;
}

/* Blinks the LED forever, three blinks at each pace, cycling through
   fast, regular, slow, regular. */
int main(int argc, char const* argv[]) {
  const unsigned int paces[] = {kFastInterval, kRegularInterval, kSlowInterval,
                                kRegularInterval};
  const int paceCount = sizeof(paces) / sizeof(paces[0]);

  rpiInit();

  /* Set the function-select bit for the LED pin. */
  gpio[LED_GPFSEL] |= (1 << LED_GPFBIT);

  for (;;) {
    for (int pace = 0; pace < paceCount; pace++) {
      for (int blink = 0; blink < 3; blink++) {
        wait(paces[pace]);
        led_on();
        wait(paces[pace]);
        led_off();
      }
    }
  }

  return 0;
}

Self and cross development environment for Raspberry Pi2

I bought a Raspberry Pi 2 and set up self and cross development environments.

1. Self dev env on Raspberry pi
Installed gcc/g++ 4. Used pre-installed gdb ver 7.4 for ARM.

src: main.s

/* main.s */
/* data section */
.data
/* Three word-sized variables, each aligned to 4 bytes:
   myvar1 = 3 and myvar2 = 4 are the operands, myvar3 receives the sum. */
.balign 4
myvar1:
  .word 3

.balign 4
myvar2:
  .word 4

.balign 4
myvar3:
  .word 0

/* text section */
.text
.balign 4
.global main
.func   main

/* main: loads myvar1 and myvar2, stores their sum in myvar3, then returns
   the value read back from myvar3 in r0.
   The addr_of_myvarN labels are not part of this listing; the disassembly
   shows them as pc-relative words placed right after the function. */
main:
  ldr r1, addr_of_myvar1  /* r1 = address of myvar1 */
  ldr r1, [r1]            /* r1 = myvar1 */
  ldr r2, addr_of_myvar2  /* r2 = address of myvar2 */
  ldr r2, [r2]            /* r2 = myvar2 */
  add r2, r1, r2          /* r2 = myvar1 + myvar2 */
  ldr r3, addr_of_myvar3  /* r3 = address of myvar3 */
  /* store value of r2 -> address of r3 */
  str r2, [r3]
  /* load [r3] into r0 */
  ldr r0, addr_of_myvar3
  ldr r0, [r0]
  bx lr

gdb test. confirmed r1/r2 are loaded from .data section and added. Return value 7 was successfully returned.

(gdb) start
Temporary breakpoint 1 at 0x8418
Starting program: /home/pi/workspace/asm/tutorial/003/hoge

Temporary breakpoint 1, 0x00008418 in main ()
(gdb) disass
Dump of assembler code for function main:
=> 0x00008418 <+0>:	ldr	r1, [pc, #32]	; 0x8440 <addr_of_myvar1>
   0x0000841c <+4>:	ldr	r1, [r1]
   0x00008420 <+8>:	ldr	r2, [pc, #28]	; 0x8444 <addr_of_myvar2>
   0x00008424 <+12>:	ldr	r2, [r2]
   0x00008428 <+16>:	add	r2, r1, r2
   0x0000842c <+20>:	ldr	r3, [pc, #20]	; 0x8448 <addr_of_myvar3>
   0x00008430 <+24>:	str	r2, [r3]
   0x00008434 <+28>:	ldr	r0, [pc, #12]	; 0x8448 <addr_of_myvar3>
   0x00008438 <+32>:	ldr	r0, [r0]
   0x0000843c <+36>:	bx	lr
End of assembler dump.
(gdb) si
0x0000841c in main ()
(gdb) si
0x00008420 in main ()
(gdb) info r
r0             0x1	1
r1             0x3	3
r2             0x7efff79c	2130704284
r3             0x8418	33816
r4             0x0	0
r5             0x0	0
r6             0x82e4	33508
r7             0x0	0
r8             0x0	0
r9             0x0	0
r10            0x76fff000	1996484608
r11            0x0	0
r12            0x76fb9000	1996197888
sp             0x7efff648	0x7efff648
lr             0x76ea481c	1995065372
pc             0x8420	0x8420 <main+8>
cpsr           0x60000010	1610612752
(gdb) si
0x00008424 in main ()
(gdb) si
0x00008428 in main ()
(gdb) si
0x0000842c in main ()
(gdb) disass
Dump of assembler code for function main:
   0x00008418 <+0>:	ldr	r1, [pc, #32]	; 0x8440 <addr_of_myvar1>
   0x0000841c <+4>:	ldr	r1, [r1]
   0x00008420 <+8>:	ldr	r2, [pc, #28]	; 0x8444 <addr_of_myvar2>
   0x00008424 <+12>:	ldr	r2, [r2]
   0x00008428 <+16>:	add	r2, r1, r2
=> 0x0000842c <+20>:	ldr	r3, [pc, #20]	; 0x8448 <addr_of_myvar3>
   0x00008430 <+24>:	str	r2, [r3]
   0x00008434 <+28>:	ldr	r0, [pc, #12]	; 0x8448 <addr_of_myvar3>
   0x00008438 <+32>:	ldr	r0, [r0]
   0x0000843c <+36>:	bx	lr
End of assembler dump.
(gdb) info r
r0             0x1	1
r1             0x3	3
r2             0x7	7
r3             0x8418	33816
r4             0x0	0
r5             0x0	0
r6             0x82e4	33508
r7             0x0	0
r8             0x0	0
r9             0x0	0
r10            0x76fff000	1996484608
r11            0x0	0
r12            0x76fb9000	1996197888
sp             0x7efff648	0x7efff648
lr             0x76ea481c	1995065372
pc             0x842c	0x842c <main+20>
cpsr           0x60000010	1610612752

2. Cross dev env with ARM simulator on Ubuntu x64
Installed crosstool-NG and configured gcc/g++ 4.9 and gdb 7.8 for ARM with ARM sim.
Since Raspberry Pi hardware’s boot loader loads initial code at 0x8000 (I’ll make bare-metal micro code later), hoge.lds defines it to be loaded there.

@ startup
  .global _start
  .align

@ Entry point: set the processor mode, set up the stack, run main, then hang.
_start:
  mov r0, #0xd3         @ 0xd3 = svc mode (0x13) with the I and F mask bits (0xc0) set
  msr cpsr, r0
  ldr sp, =0x06400000   @ initial stack pointer
  bl main
  b .                   @ main returned: spin forever

hoge.lds

/* Linker script for the bare-metal image: everything is laid out
   contiguously starting at 0x8000, in the order
   .text, .rodata, .data, .bss. */
OUTPUT_ARCH(arm)
ENTRY(_start)
SECTIONS
{
	/* Address the image is linked to run at. */
	. = 0x8000;

	.text : { *(.text*) }
	. = ALIGN(4);

	/* Read-only data. The input pattern must be .rodata*; matching
	   .data* here would pull the writable data into this section and
	   leave the real .rodata* inputs as orphan sections. */
	__rodata_start = .;
	.rodata : { *(.rodata*) }
	. = ALIGN(4);
	__rodata_end = .;

	__data_start = . ;
	.data : { *(.data*) }
	. = ALIGN(4);
	__data_end = . ;

	/* Zero-initialized data; COMMON collects tentative definitions. */
	__bss_start = . ;
	.bss : { *(.bss*) *(COMMON) }
	. = ALIGN(4);
	__bss_end = . ;
}

hoge.c

/* Minimal test program: returns 3 + 4 so the debugger session can check
   that the return value register ends up holding 7. */
int main(int argc, char const* argv[]) {
  int a = 3;
  int b = 4;
  return a + b;
}

gdb test with sim. Confirmed return value set to 7.

arm-unknown-eabi-gdb 
GNU gdb (crosstool-NG 1.20.0) 7.8
Copyright (C) 2014 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "--host=x86_64-build_unknown-linux-gnu --target=arm-unknown-eabi".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
/home/sokoide/.gdbinit:1: Error in sourced command file:
No symbol table is loaded.  Use the "file" command.
(gdb) target sim
Connected to the simulator.
(gdb) load hoge
Loading section .text, size 0x54 vma 0x8000
Loading section .rodata, size 0x8 vma 0x8054
Start address 0x8000
Transfer rate: 736 bits in <1 sec.
(gdb) file hoge
Reading symbols from hoge...(no debugging symbols found)...done.
(gdb) start
Temporary breakpoint 1 at 0x8014
Starting program: /media/psf/Dropbox/workspace/arm/helloc/hoge 

Temporary breakpoint 1, 0x00008014 in main ()
(gdb) disass
Dump of assembler code for function main:
=> 0x00008014 <+0>:	push	{r11}		; (str r11, [sp, #-4]!)
   0x00008018 <+4>:	add	r11, sp, #0
   0x0000801c <+8>:	sub	sp, sp, #20
   0x00008020 <+12>:	str	r0, [r11, #-16]
   0x00008024 <+16>:	str	r1, [r11, #-20]
   0x00008028 <+20>:	mov	r3, #3
   0x0000802c <+24>:	str	r3, [r11, #-8]
   0x00008030 <+28>:	mov	r3, #4
   0x00008034 <+32>:	str	r3, [r11, #-12]
   0x00008038 <+36>:	ldr	r2, [r11, #-8]
   0x0000803c <+40>:	ldr	r3, [r11, #-12]
   0x00008040 <+44>:	add	r3, r2, r3
   0x00008044 <+48>:	mov	r0, r3
   0x00008048 <+52>:	sub	sp, r11, #0
   0x0000804c <+56>:	pop	{r11}		; (ldr r11, [sp], #4)
   0x00008050 <+60>:	bx	lr
End of assembler dump.
(gdb) info r
r0             0xd3	211
r1             0x0	0
r2             0x0	0
r3             0x0	0
r4             0x0	0
r5             0x0	0
r6             0x0	0
r7             0x0	0
r8             0x0	0
r9             0x0	0
r10            0x0	0
r11            0x0	0
r12            0x0	0
sp             0x6400000	0x6400000
lr             0x8010	32784
pc             0x8014	0x8014 <main>
cpsr           0xd3	211
(gdb) si
0x00008018 in main ()
(gdb) si
0x0000801c in main ()
(gdb) si
0x00008020 in main ()
(gdb) si
0x00008024 in main ()
(gdb) si
0x00008028 in main ()
(gdb) si
0x0000802c in main ()
(gdb) si
0x00008030 in main ()
(gdb) si
0x00008034 in main ()
(gdb) si
0x00008038 in main ()
(gdb) si
0x0000803c in main ()
(gdb) si
0x00008040 in main ()
(gdb) si
0x00008044 in main ()
(gdb) si
0x00008048 in main ()
(gdb) info r
r0             0x7	7
r1             0x0	0
r2             0x3	3
r3             0x7	7
r4             0x0	0
r5             0x0	0
r6             0x0	0
r7             0x0	0
r8             0x0	0
r9             0x0	0
r10            0x0	0
r11            0x63ffffc	104857596
r12            0x0	0
sp             0x63fffe8	0x63fffe8
lr             0x8010	32784
pc             0x8048	0x8048 <main+52>
cpsr           0xd3	211

Make your own OS in 30 days – day17/18: Idle task and console

Day16’s task_a was sleeping until interrupted, while the other tasks task_b0, b1 and b2 were running. If there are no task_bX tasks, it needs different logic.
The book introduces an idle task to keep the design simple: there is always a runnable task other than the main task_a.

/* Idle task: halts the CPU via io_hlt() in an endless loop. Never returns. */
void task_idle(void)
{
	while (1) {
		io_hlt();
	}
}

To allow all tasks to get keyboard input, the task manager defines FIFO queue per task. Then added a console which accepts ‘mem’, ‘cls’ and ‘dir’ commands.
‘dir’ checks 8.3 file name and size.

struct FILEINFO *finfo = (struct FILEINFO *) (ADR_DISKIMG + 0x002600);

if (strcmp(cmdline, "mem") == 0) {
	sprintf(s, "total   %dMB", memtotal / (1024 * 1024));
	putfonts8_asc_sht(sheet, 8, cursor_y, COL8_FFFFFF, COL8_000000, s, 30);
	cursor_y = cons_newline(cursor_y, sheet);
	sprintf(s, "free %dKB", memman_total(memman) / 1024);
	putfonts8_asc_sht(sheet, 8, cursor_y, COL8_FFFFFF, COL8_000000, s, 30);
	cursor_y = cons_newline(cursor_y, sheet);
	cursor_y = cons_newline(cursor_y, sheet);
} else if (strcmp(cmdline, "cls") == 0) {
	for (y = 28; y < 28 + 128; y++) {
		for (x = 8; x < 8 + 240; x++) {
			sheet->buf[x + y * sheet->bxsize] = COL8_000000;
		}
	}
	sheet_refresh(sheet, 8, 28, 8 + 240, 28 + 128);
	cursor_y = 28;
} else if (strcmp(cmdline, "dir") == 0) {
	for (x = 0; x < 224; x++) {
		if (finfo[x].name[0] == 0x00) {
			break;
		}
		if (finfo[x].name[0] != 0xe5) {
			if ((finfo[x].type & 0x18) == 0) {
				sprintf(s, "filename.ext   %7d", finfo[x].size);
				for (y = 0; y < 8; y++) {
					s[y] = finfo[x].name[y];
				}
				s[ 9] = finfo[x].ext[0];
				s[10] = finfo[x].ext[1];
				s[11] = finfo[x].ext[2];
				putfonts8_asc_sht(sheet, 8, cursor_y, COL8_FFFFFF, COL8_000000, s, 30);
				cursor_y = cons_newline(cursor_y, sheet);
			}
		}
	}
	cursor_y = cons_newline(cursor_y, sheet);
}

day17_18

1 2 3