Preemptive multi-tasking

I changed the previous cooperative threads into preemptive threads.
It changes contexts during timer interrupts every 16ms with non-inteligent round robin scheduling.
I noticed that ARM swaps R13/R14 during IRQ, and needed to go back and force between IRQ and SVC mode to get the original (banked) registers as below. When IRQ_hander C function returns non null, it’ll switch contexts.

  //-- irq mode
  sub lr, lr, #4 // in IRQ mode, r14_irq(lr_irq) points to PC+#4 in user mode
  // save context
  push {r0-r12, lr} // save user mode registers

  mrs r0, spsr // spsr -> r0
  cps #0x13
  //-- svc mode
  mov r1, sp
  mov r2, lr

  cps #0x12
  //-- irq mode
  push {r0-r2} // save spsr, user mode sp, lr

  // call IRQ_hander(user-mode-sp)
  mov r0, r2
	bl	IRQ_handler
  cmp r0, #0
  bne _IRQ_interrupt_context_switch

  pop {r0-r2} // restore spsr, user mode sp, lr
  msr spsr, r0 // r0 -> spsr
  cps #0x13
  //-- svc mode
  mov sp, r1 // restore sp
  mov lr, r2 // restore lr

  cps #0x12
  //-- irq mode
  pop  {r0-r12,lr}
  movs pc, lr

  //-- irq mode
  // r0 is next thread's SP

  pop {r1-r3} // restore spsr, user mode sp, lr

  // save registers in user mode stack
  // r1: user mode cpsr
  // r2: user mode sp
  // r3: user mode lr
  sub r2, r2, #4
  str r1, [r2] // spsr

  ldr r4, [r13, #4*13]
  sub r2, r2, #4
  str r4, [r2] // user mode pc (r14_irq)

  sub r2, r2, #4
  str r3, [r2] // user mode lr

  ldr r4, [r13, #4*12]
  sub r2, r2, #4
  str r4, [r2] // user mode r12

  ldr r4, [r13, #4*11]
  sub r2, r2, #4
  str r4, [r2] // user mode r11

  ldr r4, [r13, #4*10]
  sub r2, r2, #4
  str r4, [r2] // user mode r10

  ldr r4, [r13, #4*9]
  sub r2, r2, #4
  str r4, [r2] // user mode r9

  ldr r4, [r13, #4*8]
  sub r2, r2, #4
  str r4, [r2] // user mode r8

  ldr r4, [r13, #4*7]
  sub r2, r2, #4
  str r4, [r2] // user mode r7

  ldr r4, [r13, #4*6]
  sub r2, r2, #4
  str r4, [r2] // user mode r6

  ldr r4, [r13, #4*5]
  sub r2, r2, #4
  str r4, [r2] // user mode r5

  ldr r4, [r13, #4*4]
  sub r2, r2, #4
  str r4, [r2] // user mode r4

  ldr r4, [r13, #4*3]
  sub r2, r2, #4
  str r4, [r2] // user mode r3

  ldr r4, [r13, #4*2]
  sub r2, r2, #4
  str r4, [r2] // user mode r2

  ldr r4, [r13, #4*1]
  sub r2, r2, #4
  str r4, [r2] // user mode r1

  ldr r4, [r13]
  sub r2, r2, #4
  str r4, [r2] // user mode r0

  mov r4, sp // r4 <- r13_irq

  msr spsr, r1 // r1 -> spsr
  cps #0x13          //@ svc mode
  //-- svc mode
  // change user mode stack to next thread's stack
  mov sp, r0
  // push into r13_irq (r4)
  ldr r2, [sp, #4*14] // user mode pc
  sub r4, r4, #4
  str r2, [r4]
  ldr r2, [sp, #4*15] // spsr
  sub r4, r4, #4
  str r2, [r4]
  // restore registers
  pop {r0-r12,lr}
  add sp, sp, #4*2 // pop pc, spsr

  cps #0x12          //@ irq mode
  //-- irq mode
  sub sp, sp, #4*2
  pop {lr}
  // enable irq and restore spsr
  bic lr, lr, #0x80
  msr spsr, lr // lr -> spsr
  // restore lr (user mode pc)
  pop {lr}
  // discard pushed registers
  add r13, r13, #4*14
  // movs pc, * ... mov's' pc restores status register
  movs pc, lr

I also tried to implement and add critical section but the one in ARM reference’s hanged probably because I haven’t setup MMU.

.equ  locked,   1
.equ  unlocked, 0

// BUG: lock_mutex hangs
// LDREX doesn't work when MMU is disabled. Don't use this.
// Declare for use from C as extern void lock_mutex(void * mutex);
.global _lock_mutex_mmu
  LDR     r1, =locked
  LDREX   r2, [r0]
  CMP     r2, r1        // Test if mutex is locked or unlocked
  BEQ     2f
  STREXNE r2, r1, [r0]  // Not locked, attempt to lock it
  CMPNE   r2, #1        // Check if Store-Exclusive failed
  BEQ     1b           // Failed - retry from 1
  // Lock acquired
  DMB                   // Required before accessing protected resource
  BX      lr
// Take appropriate action while waiting for mutex to become unlocked
  B       1b           // Retry from 1

// BUG: unlock_mutex
// Declare for use from C as extern void unlock_mutex(void * mutex);
.global _unlock_mutex_mmu
    LDR     r1, =unlocked
    DMB                   // Required before releasing protected resource
    STR     r1, [r0]      // Unlock mutex
    // SIGNAL_UPDATE: none
    BX      lr

// BUG: SWP version
// still hangs
.global _lock_mutex_swp
    LDR r2, =locked
    SWP r1, r2, [r0]       // Swap R2 with location [R0], [R0] value placed in R1
    CMP r1, r2             // Check if memory value was ‘locked’
    BEQ _lock_mutex_swp     // If so, retry immediately
    BX  lr                 // If not, lock successful, return

// BUG: not really excusive when context swithes after ldr befor str
.global _lock_mutex_simple
  ldr r1, =unlocked
  ldr r3, =locked
  ldr r2, [r0]
  cmp r2, r3
  beq _lock_mutex_simple
  str r1, [r0]
  bx lr

.global _unlock_mutex_simple
    LDR r1, =unlocked
    STR r1, [r0]           // Write value ‘unlocked’ to location [R0]
    BX  lr

Today’s code -> -> 009_context_switch2

Thread and context switch

I implemented a simple context switch for ARM. First, I implemented a THREAD and THREADCTL as below.

*** rpi.h
#define MAX_THREADS 1024
void InitThread();
void CreateThread(void *thread_entry);
void ContextSwitch();

typedef enum {

typedef struct {
  int *stack;
  unsigned int stackSize;

typedef struct {
  unsigned int currentId;
  unsigned int length;

extern THREADCTL threadctl;

*** rpi-thread.c
THREADCTL threadctl;

void InitThread() {
  threadctl.length = 1;
  threadctl.thread[0].state = THREAD_RUNNING;
  threadctl.currentId = 0;

  threadctl.thread[0].stack = NULL;
  /* threadctl.thrad[0].stackSize = TBD; */

void CreateThread(void *thread_entry) {
  unsigned int id = threadctl.length;
  const unsigned int stackSize = 4096;  // 4096 * sizeof(int) allocated
  int *stackBase;

  threadctl.thread[id].state = THREAD_CREATED;
  stackBase = (int *)malloc(stackSize * sizeof(int));
  stackBase += stackSize - 1;
  threadctl.thread[id].stack = stackBase-13;
  threadctl.thread[id].stackSize = stackSize;

  // push default registers into the stack which will be poped in
  // _context_switch
  *threadctl.thread[id].stack =
      (int)thread_entry;  // r14: lr (to be stored in pc)
  *threadctl.thread[id].stack = 0;  // r12
  *threadctl.thread[id].stack = 0;  // r11
  *threadctl.thread[id].stack = 0;  // r10
  *threadctl.thread[id].stack = 0;  // r9
  *threadctl.thread[id].stack = 0;  // r8
  *threadctl.thread[id].stack = 0;  // r7
  *threadctl.thread[id].stack = 0;  // r6
  *threadctl.thread[id].stack = 0;  // r5
  *threadctl.thread[id].stack = 0;  // r4
  *threadctl.thread[id].stack = 0;  // r3
  *threadctl.thread[id].stack = 0;  // r2
  *threadctl.thread[id].stack = 0;  // r1
  *threadctl.thread[id].stack = 0;  // r0
  // don't do stack--!

void ContextSwitch() {
  if (threadctl.length <= 1) {

  unsigned int currentId = threadctl.currentId;
  unsigned int nextId = currentId + 1;

  if (nextId >= threadctl.length) {
    nextId = 0;

  char message[512];
  if (NULL != threadctl.thread[currentId].stack &&
      NULL != threadctl.thread[nextId].stack) {
    sprintf(message, "%d jumping from th:%d@%p to th:%d@%p", timerctl.counter,
            currentId, *threadctl.thread[currentId].stack, nextId,
  } else {
    sprintf(message, "%d jumping from th:%d to th:%d", timerctl.counter,
            currentId, nextId);
  FillRect(0, 16, kWidth, 16, 0);
  PrintStr(0, 16, message, 7);

  // get current sp
  threadctl.thread[currentId].stack = (int *)_get_stack_pointer();

  // do context switch
  threadctl.thread[nextId].state = THREAD_RUNNING;
  threadctl.thread[currentId].state = THREAD_WAITING;
  threadctl.currentId = nextId;

The _context_switch is written in assembly which pushes r0-14 in per-thread stack and save the stack pointer in the per-thread variable ‘stack’.

.global _get_stack_pointer
  mov r0, r13
  bx lr
.global _context_switch
  // same as stmfd/stmdb !r13, {...}
  push {r0-r12,r14}
  str sp, [r0]
  ldr sp, [r1]
  // same as ldmfd/ldmia !r13, {...}
  pop {r0-r12}
  pop {pc} // pc points to the previous lr

I use it in main function.

InitThread(); // add the current thread into THREADCTL
CreateThread(task_a); // create a new thread for task_a and start it
CreateThread(task_b); // create a new thread for task_b and start it
// switch contexts using timer interrupt
while (true) {
  if (StatusFifo8(&fifoTimer) == 0) {
  } else {
    unsigned char data = GetFifo8(&fifoTimer);
     switch (data) {
      case (const int)timerData1:
        SetTimer(timer1, timerInterval1, timerData1);
        draw_counter(0, counter1);
void task_a() {
  unsigned int counter = 0;
  while (true) {
    draw_counter(1, counter++);
    ContextSwitch(); // for now, it's changing contexts by the function itself

void task_b() {
  unsigned int counter = 0;
  while (true) {
    draw_counter(2, counter++);
    ContextSwitch(); // for now, it's changing contexts by the function itself

Tested it and worked!
Today’s task_a/b have ContextSwitch() explicitly. I’ll make the context switch full automatic in the next post. -> 008_context_switch.


Unittest and refactoring for RPi baremetal

I wanted to change the previous array based timer to a list based one, and wanted to write a unittest for the list before implementing.
So, added googletest git repo from Chromium repo as a submodule and created Test.mak for unittest on the build machine for the build machine architecture (OSX, x64, mach-o), then refactored the previous 007-wfi with it for the target architecture (RPi2, cortex-a7, elf). -> 007_wfi

make -f Test.mak test
[==========] Running 9 tests from 2 test cases.
[----------] Global test environment set-up.
[----------] 5 tests from Fifo8Case
[ RUN      ] Fifo8Case.Init
[       OK ] Fifo8Case.Init (0 ms)
[ RUN      ] Fifo8Case.Put
[       OK ] Fifo8Case.Put (0 ms)
[ RUN      ] Fifo8Case.PutGet
[       OK ] Fifo8Case.PutGet (0 ms)
[ RUN      ] Fifo8Case.Put4Get2
[       OK ] Fifo8Case.Put4Get2 (0 ms)
[ RUN      ] Fifo8Case.PutOverflow
[       OK ] Fifo8Case.PutOverflow (0 ms)
[----------] 5 tests from Fifo8Case (0 ms total)

[----------] 4 tests from TimerCase
[ RUN      ] TimerCase.InitTimerCtl
[       OK ] TimerCase.InitTimerCtl (0 ms)
[ RUN      ] TimerCase.InsertTimer1
[       OK ] TimerCase.InsertTimer1 (0 ms)
[ RUN      ] TimerCase.InsertTimer5
[       OK ] TimerCase.InsertTimer5 (0 ms)
[ RUN      ] TimerCase.RemoveTimer
[       OK ] TimerCase.RemoveTimer (0 ms)
[----------] 4 tests from TimerCase (0 ms total)

[----------] Global test environment tear-down
[==========] 9 tests from 2 test cases ran. (0 ms total)
[  PASSED  ] 9 tests.

WFI and timer change

Before implementing context switch, I wanted to improve interrupt handling and better timer.
I was using busy loop to check messages from interrupt. First, I used WFI ARM instruction to wait (sleep) until interrupted.

.global _wfi
  bx lr

while (true) {
  if (StatusFifo8(&fifoTimer) == 0) {
  } else {
    unsigned char data = GetFifo8(&fifoTimer);

And I used FIFO8 used in Haribote OS (32bit tiny OS for x86).

Then quickly wrote dirty timer handling which supports up to MAX_TIMER (currently 512). It’s not using a list but an array and not efficient -> TODO.

#define MAX_TIMER 512
typedef struct _TIMER {
  unsigned int timeout;
  unsigned char data;

typedef struct _TIMERCTL {
  unsigned int counter, next;
  unsigned int length;  // number of used timers
  FIFO8 *fifo;

extern TIMERCTL timerctl;

void InitTimer(FIFO8 *fifo);
TIMER *SetTimer(TIMER *timer, unsigned int timeout, unsigned char data);

One more improvement. I bought a USB power cable with a switch! I don’t need to plug-in/out to reboot my code anymore 🙂

Today’s code -> -> 007_wfi.

USB power cable with switch.

3 timers whose intervals are 100ms, 500ms and 1s each.

Frames per second

Changed to display FPS using yesterday’s timer interrupt.
When I tested, it was 120 FPS @ 640×480, 16-17 FPS @ 1920 x 1080 just to fill whole display with a solid color.
Then I added FillRect() and it decreased to 95 FPS @ 640×480 with 3 rectangles.

It’ll be a bit better if I add 8×8 between 32×32 and 1×1, but I didn’t do that today. Even with that, it looks this pixel by pixel draw is too slow if I add more objects. Perhaps I should do block-copy of tiles and sprites.

void FillRect(int x, int y, int width, int height, char color) {
  // left ... draw pixel by pixel
  int x1 = x;
  int x2 = ((x + 31) / 32) * 32;
  if (x1 != x2) {
    char *p8 = (char *)fbRequest.fbBaseAddress;
    p8 += y * kWidth + x1;
    for (int yy = y; yy < y + height; yy++) {
      for (int xx = x1; xx < x2; xx++) {
        *p8++ = color;
      p8 += kWidth - (x2 - x1);

  // center draw 32 pixels at once
  int x3 = ((x + width) / 32) * 32;
  if (x2 != x3) {
    uint32_t *p = (uint32_t *)fbRequest.fbBaseAddress;
    p += (y * kWidth + x2) / 4;
    for (int yy = y; yy < y + height; yy++) {
      for (int xx = x2; xx < x3; xx += 4 * 8) {
        int32_t c = color << 24 | color << 16 | color << 8 | color;
        *p++ = c;
        *p++ = c;
        *p++ = c;
        *p++ = c;
        *p++ = c;
        *p++ = c;
        *p++ = c;
        *p++ = c;
      p += (kWidth - (x3 - x2)) / 4;

  // right ... draw pixel by pixel
  int x4 = x + width;
  if (x3 != x4) {
    char *p8 = (char *)fbRequest.fbBaseAddress;
    p8 += y * kWidth + x3;
    for (int yy = y; yy < y + height; yy++) {
      for (int xx = x3; xx < x4; xx++) {
        *p8++ = color;
      p8 += kWidth - (x4 - x3);

Today's code: -> 006_fps.


Print string

I drew a string using hankaku.bin 8×16 font provided by the X86 OS book.
First, I converted the 4096 byte bin file into elf format.

hankaku.o: hankaku.bin                                                                                     
  $(OBJCOPY) -I binary -O elf32-littlearm -B arm $< $@                                                     

when you convert that way, you can refer to the address in .data section by _hankaku_bin_obj_start as below.

void printstr(int x, int y, char *str, char color) {
  size_t len = strlen(str);
  for (int i = 0; i < len; i++) {
    char c = str[i];
    myputchar(x + i * 8, y, c, color);

void myputchar(int x, int y, char c, char color) {
  char *hankaku = (char *)&_binary_hankaku_bin_start;
  char *base = (char *)fbRequest.fbBaseAddress;
  char *p;
  char d;

  for (int i = 0; i < 16; i++) {
    p = base + (y + i) * kWidth + x;
    d = hankaku[c * 16 + i];

And it's displayed by this!

  printstr(10, 0, "HOG", 7);
  printstr(10 + 8 * 3, 0, "E", 1);

Today's code: -> 005_character.


Timer interrupt

Implemented timer. First, it needs interrupt vector table at address 0x00000000.
The following (not optimized my first) arm code “_init_vector_table” copies _initialize_vector_start to _initialize_vector_end to 0x00000000.

First, I tried to copy that in C, but uint32_t* vector_table; *vector_table=… was not compiled as I expected by cross gcc 4.9 perhaps because 0 is NULL in C and *NULL is illegal.

.global  _initialize_vector_start
        ldr     pc, _vec_Reset
        ldr     pc, _vec_Undef
        ldr     pc, _vec_SWI
        ldr     pc, _vec_PrefAbort
        ldr     pc, _vec_DataAbort
        ldr     pc, _vec_Reserved
        ldr     pc, _vec_IRQ
        ldr     pc, _vec_FIQ
_vec_Reset:             .word   _start
_vec_Undef:             .word   _hangup
_vec_SWI:               .word   _hangup
_vec_PrefAbort: .word   _hangup
_vec_DataAbort: .word   _hangup
_vec_Reserved:  .word   _hangup
_vec_IRQ:               .word   _IRQ_iterrupt
_vec_FIQ:               .word   _hangup
        .global  _initialize_vector_end
# dummy instruction to keep initialize_vector_end label
        mov r0,r0

.global _init_vector_table
  ldr r0, =0x0
  ldr r1, =_initialize_vector_start
  ldr r3, =_initialize_vector_end
  cmp r1, r3
  bxeq lr
  ldr r2, [r1]
  str r2, [r0]
  add r1, #4
  add r0, #4
  cmp r1, r3
  bne _init_vector_table_loop
  bx lr

Then, _IRQ_iterrupt calls C function IRQ_handler.

	stmfd	r13!, {r0-r12,lr}
	bl	IRQ_handler
	ldmfd	r13!, {r0-r12,lr}
	subs	pc,lr, #4

IRQ_handler sets g_interrupt flag.

volatile static bool g_interrupt = false;

// called by _IRQ_interrupt in startup.s
void IRQ_handler(void) {
  // disable IRQ

  if (*INTERRUPT_IRQ_BASIC_PENDING & 0x01 != 0) {
    // Timer interrupt handler
    g_interrupt = true;
    // clear interrupt flag
    *TIMER_IRQ_CLR = 0;

  // enable IRQ

Then dirty main loop is checking the flag. I’ll change it to wait for interrupt like another x86 OS is doing.

  while (true) {
    if (true == g_interrupt) {
      g_interrupt = false;
    } else {

Today’s code: -> 004_timer_interrupt.

Draw pixels

Got VRAM frame buffer address from Video Core via MailBox protocol and drew 1920×1080 pixels. I found it’s very slow ~3-5 fps.
Then I changed it to 640×480 which made it faster but not 60fps.
I’ll need to initialize interrupts & use timer and add fonts to show FPS tomorrrow.

Buffer address specified by MailBox’s property-tag protocol should be 16 byte aligned since the least 4 bits are used to specify tag (7 for ARM to VC, 8 for VC to ARM) like this.

struct FramebufferRequest {
  uint32_t size;
  uint32_t bufferRequestResponseCode;

  // Set_Physical_Display
  uint32_t tag_setPd;
  uint32_t size_setPd;
  uint32_t rr_setPd;
  uint32_t width_setPd;
  uint32_t height_setPd;
} fbRequest __attribute__((aligned(16)));

Today’s code: -> 003_screen.


Implementing syscalls for libc

If you try to use C-library functions such as malloc() or printf(), link fails with these since they depends on these system calls.

/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-sbrkr.o): In function `_sbrk_r':
sbrkr.c:(.text+0x18): undefined reference to `_sbrk'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-writer.o): In function `_write_r':
writer.c:(.text+0x20): undefined reference to `_write'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-closer.o): In function `_close_r':
closer.c:(.text+0x18): undefined reference to `_close'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-fstatr.o): In function `_fstat_r':
fstatr.c:(.text+0x1c): undefined reference to `_fstat'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-isattyr.o): In function `_isatty_r':
isattyr.c:(.text+0x18): undefined reference to `_isatty'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-lseekr.o): In function `_lseek_r':
lseekr.c:(.text+0x20): undefined reference to `_lseek'
/home/sokoide/cross/rpi/arm-unknown-eabi/arm-unknown-eabi/lib//libc.a(lib_a-readr.o): In function `_read_r':
readr.c:(.text+0x20): undefined reference to `_read'

We can’t use OS’s syscalls obviously since there is no OS. The book suggested to implement it by reffering to ~/cross/src/ct-ng_rpi/.build/src/newlib-1.20.0/newlib/libc/sys/arm/syscalls.c.

I implemented a stub and wrote _write as below. Since there is no stdout yet, I only copied it in memory and confirmed it in gdb + arm sim.

_write in syscalls.c:

int _write(int file, char *ptr, int len) {
  // TODO:
  for (int i = 0; i < len; i++) {
  return len;

_writememory in startup.s: only to confirm it's called with a right character

  bx  lr

confirmation: tried to confirm with arm sim on gdb, and got this. It looks it's not implemented in arm sim.

(gdb) target sim
Connected to the simulator.
(gdb) load hoge.elf
Loading section .text, size 0x8cf0 vma 0x8000
Loading section .rodata, size 0xa48 vma 0x10cf0
Loading section .rodata.str1.4, size 0xc vma 0x11738
Loading section .rodata.str1.1, size 0x4b vma 0x11744
Loading section .ARM.exidx, size 0x8 vma 0x11790
Start address 0x8000
Transfer rate: 310456 bits in <1 sec.
(gdb) file hoge.elf
Reading symbols from hoge.elf...done.
(gdb) b _write_memory
Breakpoint 1 at 0x8020
(gdb) start
Temporary breakpoint 2 at 0x808c
Starting program: /media/psf/Dropbox/workspace/arm/rpi-baremetal/002/hoge.elf 

Temporary breakpoint 2, 0x0000808c in main ()
(gdb) c
Unhandled v6 thumb insn: 4601
[Inferior 1 (process 42000) exited with code 0215]

Uploaded above at -> 002_libcstub.

1 2