Faster JSON parser RapidJSON

I moved from picojson to RapidJSON which is faster and also header-only.

// ParseInsitu() rewrites its input buffer while parsing, so parse a private,
// NUL-terminated copy of *target instead of the original text.
// A std::string is used rather than a variable-length array: VLAs are a
// compiler extension in C++ and would place the whole document on the stack.
// The copy must stay alive for as long as `document` is used, because in-situ
// string values point into it.
string buffer(target->c_str(), target->size());
cerr << boost::format("buffer:'%s'\n") % buffer.c_str();

// &buffer[0] is valid even for an empty string (it refers to the terminator).
if (document.ParseInsitu(&buffer[0]).HasParseError()) {
  cerr << "* parse error\n";
  fprintf(stderr, "\nError(offset %u): %s\n",
          (unsigned)document.GetErrorOffset(),
          GetParseError_En(document.GetParseError()));

  return 1;
}

// Scalar members of the top-level object.
double hoge = document["hoge"].GetDouble();
bool page = document["page"].GetBool();
const string piyo = document["piyo"].GetString();

// Re-serialize the nested object as pretty-printed JSON text.
const Value& ij = document["innerjson"];
StringBuffer buf;
PrettyWriter<StringBuffer> wr(buf);
ij.Accept(wr);
const char* jsIj = buf.GetString();

cout << boost::format("hoge: %f\n") % hoge;
cout << boost::format("page: %d\n") % page;
cout << boost::format("piyo: %s\n") % piyo;
cout << boost::format("innerjson: %s\n") % jsIj;

Json

{
  "hoge":23.4,
  "page":true,
  "piyo":"piyopiypo",
  "innerjson": {
    "foo": "bar",
    "baz": {
      "bar": 123,
      "bar2": "hogehoge"
    }
  }
}

Premake

I learned premake because the libraries I use in my new project require it. Here is what I found.

  • To change C/C++ compiler, use premake.gcc.cc/cxx or CC/CXX env vars.
  • You can have ‘test’ configuration in a project, but you can’t change ‘files’ per configuration for now (will be supported in premake 4.5). -> I created ‘Test’ project.
  • OSX has a problem in -Wl,-x handling. -> I added ‘Symbols’ in all build targets.

My sample premake.

-- premake4.lua
-- Build description for the "Hoge" console application: one solution with
-- "debug" and "release" configurations.
solution "HogeSolution"
  configurations { "debug", "release" }
  -- switch gcc / clang
  -- (the gcc toolset's compiler names are overridden; the gcc pair is kept
  -- commented out as the alternative)
  --premake.gcc.cc = "gcc-5"
  --premake.gcc.cxx = "g++-5"
  premake.gcc.cc = "clang-3.6"
  premake.gcc.cxx = "clang++-3.6"

-- per configuration file list is not supported as of 2015.8
-- will be supported in premake 4.5
-- http://stackoverflow.com/questions/9158151/premake-different-excludes-command-for-each-configuration

-- A project defines one build target
  project "Hoge"

    kind "ConsoleApp"
    language "C++"
    buildoptions {
      "-std=c++11",
      "-Wall"
    }
    linkoptions {
      "-stdlib=libc++"
    }

    -- precompiled header shared by the sources
    pchheader "pre.h"

    -- every header/source in the tree ...
    files {
     "**.h", "**.hpp", "**.cpp"
    }

    -- ... except the unit tests, which are built by a separate test project.
    -- NOTE(review): "*" (unlike "**") presumably matches only the top-level
    -- directory -- confirm if tests ever move into subdirectories.
    excludes {
      "*_test.cpp"
    }

    includedirs {
    }

    libdirs {
      "/usr/local/opt/llvm36/lib/llvm-3.6/lib"
    }

    links {
    }

    -- settings below apply only to the named configuration
    configuration "debug"
      defines { "DEBUG" }
      flags { "Symbols" }
      targetname "hoge-d"

    configuration "release"
      -- linker fails if you don't have "Symbols" on osx with -Wl,-x
      flags { "Symbols", "Optimize" }
      targetname "hoge"


  project "HogeTest"
    -- Unit-test runner: all sources except main.cpp, linked against googletest.
    --
    -- assume you have the following gmake setup
    -- 1. export REPO=your-local-repo-home
    -- 2. cd $REPO
    -- 3. git clone https://github.com/google/googletest
    -- 4. cd $REPO/googletest
    -- 5. cmake .
    -- 6. make

    -- env vars
    -- os.getenv returns nil when REPO is unset; fail with a clear message
    -- instead of the obscure "attempt to concatenate a nil value" error.
    local repo = os.getenv("REPO")
    if repo == nil or repo == "" then
      error("REPO is not set: export REPO=<directory that contains the googletest checkout>")
    end
    local gtest = repo .. "/googletest"


    kind "ConsoleApp"
    language "C++"
    buildoptions {
      "-std=c++11",
      "-Wall"
    }
    linkoptions {
      "-stdlib=libc++"
    }

    pchheader "pre.h"

    files {
      "**.h", "**.hpp", "**.cpp"
    }

    -- gtest_main supplies main(), so the application's own entry point is left out
    excludes {
      "main.cpp"
    }

    includedirs {
      gtest .. "/include"
    }

    libdirs {
      "/usr/local/opt/llvm36/lib/llvm-3.6/lib",
      gtest
    }

    links {
      "gtest_main",
      "gtest"
    }

    flags { "Symbols" }
    targetname "hoge-test"

How to run.

premake4 gmake

make Hoge config=debug
make Hoge config=release

make HogeTest
./hoge-test
Running main() from gtest_main.cc
[==========] Running 2 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 2 tests from HogeTest
[ RUN      ] HogeTest.Hoge1
[       OK ] HogeTest.Hoge1 (0 ms)
[ RUN      ] HogeTest.Hoge2
[       OK ] HogeTest.Hoge2 (0 ms)
[----------] 2 tests from HogeTest (0 ms total)

[----------] Global test environment tear-down
[==========] 2 tests from 1 test case ran. (1 ms total)
[  PASSED  ] 2 tests.

sample test:
#include "hoge.hpp"
#include "gtest/gtest.h"

class HogeTest : public ::testing::Test {
  virtual void SetUp() {}
};

TEST_F(HogeTest, Hoge1) { EXPECT_EQ(3, 3); }

TEST_F(HogeTest, Hoge2) {
  Hoge h;
  int result = h.add(-2, 2);
  EXPECT_EQ(0, result);
}

Header only CPP JSON library picojson

Tried a nice JSON library for C++ @ https://github.com/kazuho/picojson.

Json:

{
  "hoge": 12.3,
  "page": true,
  "piyo": "piyopiypo",
  "ar": [
    1,2,3,4,5
  ],
  "ar2": [
    {
      "foo": "foo string",
      "bar": 12345.6
    },
    {
      "foo": "foo2 string",
      "bar": 78.99
    }
  ]
}

Test:

#include "pre.h"
pre.h ***
#define __STDC_WANT_LIB_EXT1__ 1

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <boost/format.hpp>

#define PICOJSON_USE_INT64
#include "deps/picojson.h"

main.cpp ***
#include "pre.h"

using namespace std;

int main(int argc, char const *argv[]) {
  printf("* started.\n");

  picojson::value json;
  cin >> json;

  picojson::object& o = json.get<picojson::object>();
  int hoge = o["hoge"].get<double>();
  bool page = o["page"].get<bool>();
  string piyo = o["piyo"].get<string>();

  cout << boost::format("hoge: %d\n") % hoge;
  cout << boost::format("page: %d\n") % page;
  cout << boost::format("piyo: %s\n") % piyo;

  picojson::array ar = o["ar"].get<picojson::array>();
  for (picojson::array::iterator it = ar.begin(); it != ar.end(); it++)
  {
    cout << boost::format("ar item: %d\n") % it->get<int64_t>();
  }

  picojson::array ar2 = o["ar2"].get<picojson::array>();
  for (picojson::array::iterator it = ar2.begin(); it != ar2.end(); it++)
  {
    picojson::object& item = it->get<picojson::object>();
    cout << boost::format("foo: %s\n") % item["foo"].get<string>();
    cout << boost::format("bar: %f\n") % item["bar"].get<double>();
  }

  printf("* completed.\n");
}

Swift – C++ interop

It looks like Swift can't call C++ directly and needs to call it via Objective-C++.

Swift Button Handler calls ObjectiveC class ObjCHoge’s functions as below.

/// Button handler: runs add/sub through the Objective-C wrapper and logs both
/// results, each prefixed with the same timestamp.
@IBAction func buttonClicked(sender : AnyObject) {
    let now = NSDate()
    let timeFormatter = NSDateFormatter()
    timeFormatter.timeStyle = .LongStyle

    let lhs:Int32 = 10
    let rhs:Int32 = 20
    let calculator = ObjCHoge()

    let sumLine = String(format: "[%@] %d+%d=%d\n", timeFormatter.stringFromDate(now), lhs, rhs, calculator.add(lhs,rhs))
    insertMessage(sumLine)

    let differenceLine = String(format: "[%@] %d-%d=%d\n", timeFormatter.stringFromDate(now), lhs, rhs, calculator.sub(lhs,rhs))
    insertMessage(differenceLine)
}

BridgingHeader.h should import only pure Objective-C headers, which I think can’t contain C++ classes.

#import "ObjCHoge.h"

Then ObjCHoge.h/.mm calls the CppHoge methods. I didn’t want to use “void*” to store the CppHoge instance, but it looks like ObjCHoge.h has to be a pure Objective-C header for Swift to call it, so it cannot declare a CppHoge*.
Hm… Objective-C (OSX/iOS UI)/C# (Win UI)/Java (Android UI) – C++ (logic) is more handy than Swift (OSX/iOS UI)/C# (Win UI)/Java (Android UI) – C++ (logic) when writing cross platform code for OSX/iOS.

Let me know if there is a better way.

#import <Foundation/Foundation.h>

// Objective-C wrapper that exposes add/sub operations to Swift.
// The header contains no C++ so that it can be imported from the bridging header.
@interface ObjCHoge : NSObject
{
    // it looks ObjCHoge.h should be pure Objective-C for Swift to call
    // and can't define CppHoge* _cppHoge
    // Opaque pointer; its concrete type is not visible from this header.
    void* _cppHoge;
}

// Each method takes two ints (the second parameter has no label) and returns an int.
-(int)add:(int)a :(int)b;
-(int)sub:(int)a :(int)b;
// Note: declared with "-", so these are instance methods despite the "static_" prefix.
-(int)static_add:(int)a :(int)b;
-(int)static_sub:(int)a :(int)b;

@end

#include "ObjCHoge.h"
#include "CppHoge.h"

@implementation ObjCHoge

// Allocates the wrapped CppHoge once the Objective-C side is initialised.
-(id)init{
    self = [super init];
    if (!self) {
        return self;
    }

    _cppHoge = new CppHoge();
    return self;
}

// Releases the wrapped C++ object.
-(void)dealloc {
    CppHoge* wrapped = static_cast<CppHoge*>(_cppHoge);
    delete wrapped;
}

// Forwards to CppHoge::add on the wrapped instance.
-(int)add:(int)a :(int)b {
    return static_cast<CppHoge*>(_cppHoge)->add(a, b);
}

// Forwards to CppHoge::sub on the wrapped instance.
-(int)sub:(int)a :(int)b {
    return static_cast<CppHoge*>(_cppHoge)->sub(a, b);
}

// Forwards to the static CppHoge::static_add; the wrapped instance is not used.
-(int)static_add:(int)a :(int)b {
    const int sum = CppHoge::static_add(a, b);
    return sum;
}

// Forwards to the static CppHoge::static_sub; the wrapped instance is not used.
-(int)static_sub:(int)a :(int)b {
    const int difference = CppHoge::static_sub(a, b);
    return difference;
}

@end

CppHoge.h/.cpp

//
//  CppHoge.h
//

// Include guard. Identifiers containing a double underscore are reserved for
// the implementation, so the guard uses a plain upper-case name.
#ifndef CP_OSX_CPPHOGE_H
#define CP_OSX_CPPHOGE_H

#include <stdio.h>

// Small arithmetic helper used to demonstrate calling C++ from Swift through an
// Objective-C++ wrapper. Offers the same two operations as instance and as
// static member functions.
class CppHoge
{
public:
    // Instance operations on two ints; each returns an int.
    int add(int a, int b);
    int sub(int a, int b);
    // Static counterparts with the same signatures; no object is needed.
    static int static_add(int a, int b);
    static int static_sub(int a, int b);
};

#endif /* CP_OSX_CPPHOGE_H */

//
//  CppHoge.cpp
//

#include "CppHoge.h"


int CppHoge::add(int a, int b){
    return a+b;
}

int CppHoge::sub(int a, int b){
    return a-b;
}

int CppHoge::static_add(int a, int b){
    return a+b;
}

int CppHoge::static_sub(int a, int b){
    return a-b;
}

Integrating with ml64.exe

I’m back to Windows.
First, I started with writing simple functions in 32 and 64bit assembly. Visual Studio compiler supports 32bit inline assembly, but doesn’t support 64bit. Today, I used ml.exe/ml64.exe for both.

Here is the caller in C.

// asm.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

#ifdef __cplusplus
extern "C" {
#endif
  char* _message();
  int _add(int a, int b);
  int _sub(int a, int b);
  int _fib(int a);
#ifdef __cplusplus
}
#endif

// Console entry point: exercises the assembly routines (_add, _sub, _fib,
// _message) and prints their results.
int _tmain(int argc, _TCHAR* argv[])
{
  const int lhs = 10;
  const int rhs = 20;

  printf("* started\n");

  printf("%d+%d=%d\n", lhs, rhs, _add(lhs, rhs));
  printf("%d-%d=%d\n", lhs, rhs, _sub(lhs, rhs));

  // First twenty Fibonacci numbers, fib(0) .. fib(19).
  int n = 0;
  while (n < 20) {
    printf("fib(%d)=%d\n", n, _fib(n));
    ++n;
  }

  printf("%s\n", _message());
  printf("* done\n");
  return 0;
}

That calls these.
asm64.asm:

; x64 (ml64.exe) implementations of the routines declared extern "C" in asm.cpp.
; Windows x64 calling convention: the first two integer arguments arrive in
; rcx and rdx, and the result is returned in rax. For 32-bit int arguments only
; the low 32 bits (ecx/edx) are defined, so the int routines below work on the
; 32-bit sub-registers and never read the upper halves.

.data
MSG	DB  "Hoge x64,", 0dh, 0ah, "Page!", 0dh, 0ah, 0

.code
; char* _message()
; Returns the address of the NUL-terminated MSG string.
_message PROC
	mov rax, offset MSG
	ret
_message ENDP

; int _add(int a, int b)
; Returns a + b in eax.
_add PROC
	add ecx, edx
	mov eax, ecx
	ret
_add ENDP

; int _sub(int a, int b)
; Returns a - b in eax.
_sub PROC
	sub ecx, edx
	mov eax, ecx
	ret
_sub ENDP

; int _fib(int a)
; Recursive Fibonacci: fib(0) = 0, fib(1) = 1, fib(n) = fib(n-1) + fib(n-2).
; r12 and r13 are non-volatile, so they are saved and restored around the
; recursion. The routine only ever calls itself, so it reserves no shadow space
; and does not realign the stack; both would be required before calling any
; other function.
_fib PROC
	cmp ecx, 0
	jz @ret0
	cmp ecx, 1
	jz @ret1

	push r12
	push r13

	; r12d = a - 1, r13d = fib(a - 1)
	mov r12d, ecx
	sub r12d, 1
	mov ecx, r12d
	call _fib
	mov r13d, eax

	; r12d = a - 2, r13d += fib(a - 2)
	sub r12d, 1
	mov ecx, r12d
	call _fib
	add r13d, eax

	mov eax, r13d
	pop r13
	pop r12
	ret
@ret0:
	mov eax, 0
	ret
@ret1:
	mov eax, 1
	ret
_fib ENDP

END

asm32.asm:

; x86 (ml.exe) implementations of the routines declared extern "C" in asm.cpp.
; The listing previously shown here was a copy of the x64 file (rax/rcx/r12
; registers, "Hoge x64," message) and cannot be assembled by the 32-bit ml.exe.
; cdecl convention: arguments are on the stack ([esp+4] is the first one on
; entry), the result is returned in eax, the caller removes the arguments, and
; esi/edi must be preserved by the callee.
; ".model flat, c" makes MASM prefix public names with an underscore, matching
; the decoration the C compiler applies to the extern "C" declarations.

.386
.model flat, c

.data
MSG	DB  "Hoge x86,", 0dh, 0ah, "Page!", 0dh, 0ah, 0

.code
; char* _message()
; Returns the address of the NUL-terminated MSG string.
_message PROC
	mov eax, offset MSG
	ret
_message ENDP

; int _add(int a, int b)
; Returns a + b.
_add PROC
	mov eax, [esp+4]
	add eax, [esp+8]
	ret
_add ENDP

; int _sub(int a, int b)
; Returns a - b.
_sub PROC
	mov eax, [esp+4]
	sub eax, [esp+8]
	ret
_sub ENDP

; int _fib(int a)
; Recursive Fibonacci: fib(0) = 0, fib(1) = 1, fib(n) = fib(n-1) + fib(n-2).
_fib PROC
	mov ecx, [esp+4]
	cmp ecx, 0
	jz @ret0
	cmp ecx, 1
	jz @ret1

	push esi
	push edi

	; esi = a - 1, edi = fib(a - 1)
	mov esi, ecx
	sub esi, 1
	push esi
	call _fib
	add esp, 4
	mov edi, eax

	; esi = a - 2, edi += fib(a - 2)
	sub esi, 1
	push esi
	call _fib
	add esp, 4
	add edi, eax

	mov eax, edi
	pop edi
	pop esi
	ret
@ret0:
	mov eax, 0
	ret
@ret1:
	mov eax, 1
	ret
_fib ENDP

END

Configure the vcproj to run ml/ml64 using a "custom build" step, as below.
asm64_config

64 bit result:

C:\Users\sokoide\Projects\Spike\x64\Debug\asm.exe
* started
10+20=30
10-20=-10
fib(0)=0
fib(1)=1
fib(2)=1
fib(3)=2
fib(4)=3
fib(5)=5
fib(6)=8
fib(7)=13
fib(8)=21
fib(9)=34
fib(10)=55
fib(11)=89
fib(12)=144
fib(13)=233
fib(14)=377
fib(15)=610
fib(16)=987
fib(17)=1597
fib(18)=2584
fib(19)=4181
Hoge x64,
Page!

* done

32 bit result:

>C:\Users\sokoide\Projects\Spike\Debug\asm.exe
* started
10+20=30
10-20=-10
fib(0)=0
fib(1)=1
fib(2)=1
fib(3)=2
fib(4)=3
fib(5)=5
fib(6)=8
fib(7)=13
fib(8)=21
fib(9)=34
fib(10)=55
fib(11)=89
fib(12)=144
fib(13)=233
fib(14)=377
fib(15)=610
fib(16)=987
fib(17)=1597
fib(18)=2584
fib(19)=4181
Hoge x86,
Page!

* done

Preemptive multi-tasking

I changed the previous cooperative threads into preemptive threads.
It switches contexts during timer interrupts every 16ms, using simple (non-intelligent) round-robin scheduling.
I noticed that ARM banks R13/R14 per mode, so during an IRQ I needed to go back and forth between IRQ and SVC mode to read the interrupted code's original (banked) registers, as below. When the IRQ_handler C function returns non-null, the handler switches contexts.

// IRQ entry point.
// Saves the interrupted context on the IRQ stack, calls the C function
// IRQ_handler, and then either returns to the interrupted code (handler
// returned 0) or branches to _IRQ_interrupt_context_switch (non-zero return).
// IRQ-stack layout while IRQ_handler runs, from the top of the stack upwards:
//   spsr, interrupted sp, interrupted lr, r0-r12, return address
// "cps #0x13" / "cps #0x12" switch between SVC and IRQ mode so that the
// banked sp/lr of the interrupted (SVC-mode) code can be read and restored.
_IRQ_iterrupt:
  //-- irq mode
  sub lr, lr, #4 // in IRQ mode, r14_irq(lr_irq) points to PC+#4 in user mode
  // save context
  push {r0-r12, lr} // save user mode registers

  mrs r0, spsr // spsr -> r0
  cps #0x13
  //-- svc mode
  // r1/r2 <- banked sp/lr of the interrupted code
  mov r1, sp
  mov r2, lr

  cps #0x12
  //-- irq mode
  push {r0-r2} // save spsr, user mode sp, lr

  // call IRQ_handler(...)
  // NOTE(review): r2 holds the interrupted code's lr (its sp is in r1), so the
  // value passed here is lr, not the "user-mode-sp" the original comment
  // named -- confirm which one IRQ_handler expects.
  mov r0, r2
	bl	IRQ_handler
  // non-zero return value: switch to another thread
  cmp r0, #0
  bne _IRQ_interrupt_context_switch

  // zero: undo the saves above in reverse order and resume the interrupted code
  pop {r0-r2} // restore spsr, user mode sp, lr
  msr spsr, r0 // r0 -> spsr
  cps #0x13
  //-- svc mode
  mov sp, r1 // restore sp
  mov lr, r2 // restore lr

  cps #0x12
  //-- irq mode
  pop  {r0-r12,lr}
  // "movs pc, lr" also copies spsr back into cpsr
  movs pc, lr

// Context-switch tail of the IRQ handler; entered in IRQ mode with
//   r0         = next thread's stack pointer (value returned by IRQ_handler)
//   IRQ stack  = spsr, interrupted sp, interrupted lr, r0-r12, return address
// Step 1: copy the interrupted context onto the interrupted thread's own
//         stack as a 16-word frame (low to high address):
//           r0-r12, lr, pc, spsr
// Step 2: switch the SVC-mode sp to the next thread's stack, which is read
//         using the same 16-word layout.
// Step 3: hand the next thread's pc and spsr to IRQ mode through the IRQ
//         stack, then return into the next thread with "movs pc, lr".
// NOTE(review): the final value of r2 (the interrupted thread's new stack top)
// is not stored anywhere in this routine; presumably IRQ_handler records it
// beforehand -- verify.
_IRQ_interrupt_context_switch:
  //-- irq mode
  // r0 is next thread's SP

  pop {r1-r3} // restore spsr, user mode sp, lr

  // save registers in user mode stack
  // r1: user mode cpsr
  // r2: user mode sp
  // r3: user mode lr
  // r2 is used as a descending write cursor into the interrupted thread's
  // stack; r4 is a scratch register for the values still saved on the IRQ stack.
  sub r2, r2, #4
  str r1, [r2] // spsr

  ldr r4, [r13, #4*13]
  sub r2, r2, #4
  str r4, [r2] // user mode pc (r14_irq)

  sub r2, r2, #4
  str r3, [r2] // user mode lr

  ldr r4, [r13, #4*12]
  sub r2, r2, #4
  str r4, [r2] // user mode r12

  ldr r4, [r13, #4*11]
  sub r2, r2, #4
  str r4, [r2] // user mode r11

  ldr r4, [r13, #4*10]
  sub r2, r2, #4
  str r4, [r2] // user mode r10

  ldr r4, [r13, #4*9]
  sub r2, r2, #4
  str r4, [r2] // user mode r9

  ldr r4, [r13, #4*8]
  sub r2, r2, #4
  str r4, [r2] // user mode r8

  ldr r4, [r13, #4*7]
  sub r2, r2, #4
  str r4, [r2] // user mode r7

  ldr r4, [r13, #4*6]
  sub r2, r2, #4
  str r4, [r2] // user mode r6

  ldr r4, [r13, #4*5]
  sub r2, r2, #4
  str r4, [r2] // user mode r5

  ldr r4, [r13, #4*4]
  sub r2, r2, #4
  str r4, [r2] // user mode r4

  ldr r4, [r13, #4*3]
  sub r2, r2, #4
  str r4, [r2] // user mode r3

  ldr r4, [r13, #4*2]
  sub r2, r2, #4
  str r4, [r2] // user mode r2

  ldr r4, [r13, #4*1]
  sub r2, r2, #4
  str r4, [r2] // user mode r1

  ldr r4, [r13]
  sub r2, r2, #4
  str r4, [r2] // user mode r0

  // keep the IRQ stack pointer in r4 so SVC-mode code can write below it
  mov r4, sp // r4 <- r13_irq

  msr spsr, r1 // r1 -> spsr
  cps #0x13          //@ svc mode
  //-- svc mode
  // change user mode stack to next thread's stack
  mov sp, r0
  // push into r13_irq (r4)
  // (pc is written first, then spsr below it; r13_irq itself is not moved here)
  ldr r2, [sp, #4*14] // user mode pc
  sub r4, r4, #4
  str r2, [r4]
  ldr r2, [sp, #4*15] // spsr
  sub r4, r4, #4
  str r2, [r4]
  // restore registers
  pop {r0-r12,lr}
  add sp, sp, #4*2 // pop pc, spsr

  cps #0x12          //@ irq mode
  //-- irq mode
  // step r13_irq down over the two words written through r4 above
  sub sp, sp, #4*2
  pop {lr}
  // enable irq and restore spsr
  // (bit 0x80 is the IRQ-disable bit; clearing it unmasks IRQs in the next thread)
  bic lr, lr, #0x80
  msr spsr, lr // lr -> spsr
  // restore lr (user mode pc)
  pop {lr}
  // discard pushed registers
  // (the 14 words r0-r12, lr pushed on IRQ entry)
  add r13, r13, #4*14
  // movs pc, * ... mov's' pc restores status register
  movs pc, lr

I also tried to implement a critical section, but the mutex from the ARM reference hung, probably because I haven’t set up the MMU.

// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dht0008a/ch01s03s02.html
.equ  locked,   1
.equ  unlocked, 0

// BUG: lock_mutex hangs
// LDREX doesn't work when MMU is disabled. Don't use this.
// Declare for use from C as extern void lock_mutex(void * mutex);
.global _lock_mutex_mmu
_lock_mutex_mmu:
  LDR     r1, =locked
1:
  LDREX   r2, [r0]
  CMP     r2, r1        // Test if mutex is locked or unlocked
  BEQ     2f
  STREXNE r2, r1, [r0]  // Not locked, attempt to lock it
  CMPNE   r2, #1        // Check if Store-Exclusive failed
  BEQ     1b           // Failed - retry from 1
  // Lock acquired
  DMB                   // Required before accessing protected resource
  BX      lr
2:
// Take appropriate action while waiting for mutex to become unlocked
  //wfi
  nop
  B       1b           // Retry from 1


// BUG: unlock_mutex
// Declare for use from C as extern void unlock_mutex(void * mutex);
.global _unlock_mutex_mmu
_unlock_mutex_mmu:
    LDR     r1, =unlocked
    DMB                   // Required before releasing protected resource
    STR     r1, [r0]      // Unlock mutex
    // SIGNAL_UPDATE: none
    BX      lr


// BUG: SWP version
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dht0008a/CJHBGBBJ.html
// still hangs
.global _lock_mutex_swp
_lock_mutex_swp:
    LDR r2, =locked
    SWP r1, r2, [r0]       // Swap R2 with location [R0], [R0] value placed in R1
    CMP r1, r2             // Check if memory value was ‘locked’
    BEQ _lock_mutex_swp     // If so, retry immediately
    BX  lr                 // If not, lock successful, return

// Simple spin lock: void _lock_mutex_simple(void *mutex)
//   r0 = address of the mutex word (holds `locked` or `unlocked`)
// Spins until the word is not `locked`, then stores `locked` into it.
// BUG: not really exclusive -- a context switch between the ldr and the str
// lets two threads both see `unlocked` and both take the lock.
.global _lock_mutex_simple
_lock_mutex_simple:
  ldr r3, =locked
  ldr r2, [r0]
  cmp r2, r3
  beq _lock_mutex_simple  // still locked: retry
  str r3, [r0]            // take the lock (this used to store `unlocked`)
  bx lr

.global _unlock_mutex_simple
_unlock_mutex_simple:
    LDR r1, =unlocked
    STR r1, [r0]           // Write value ‘unlocked’ to location [R0]
    BX  lr

Today’s code -> https://github.com/sokoide/rpi-baremetal -> 009_context_switch2

Learning ARM assembly basics

Before writing more ARM code, I read up on the basics on the Internet.

STMFD = store multiple registers, full descending = stmdb (decrement before)
LDMFD = load multiple registers, full descending = ldmia (increment after)
If ! is specified after the base register, the updated address is written back to it.

STMFD:

pre:
r1 = 0x1
r2 = 0x2
r13 = 0x00008000

stmfd r13!, {r1, r2}

post:
r1 = 0x1
r2 = 0x2
r13 = 0x00007ff8
mem[0x00007ff8] = 0x00000001
mem[0x00007ffc] = 0x00000002

LDMFD:

pre:
r1 = 0x0
r2 = 0x0
r13 = 0x00008000
mem[0x00008000] = 0x00000001
mem[0x00008004] = 0x00000002

ldmfd r13!, {r1, r2}

post:
r1 = 0x00000001
r2 = 0x00000002
r13 = 0x00008008

And some more info.
R0-R3 (and R12): scratch registers, don’t need to be saved by the callee
R4-R11: callee saved
R13=SP (Stack Pointer) stack
R14=LR (Link Register) to store return address of function call
R15=PC (Program Counter) instruction pointer

If you want to do like this x86,

push 1
push 2
call hoge
// result in EAX

It’ll be like this in ARM.

mov r0, #1
mov r1, #2
bl hoge // set LR=next instruction pointer and branch to hoge
// result in r0

startup.s:

@ startup
  .align

.global _start
_start:
  ldr r0, =0x000000d3
  msr cpsr, r0
  ldr sp, =0x06400000
  bl main
  b .

.global _add
_add:
  add r0, r0, r1
  mov pc, lr

.global _sub
_sub:
  sub r0, r0, r1
  mov pc, lr

// int _fib(int n): recursive Fibonacci.
//   in:  r0 = n
//   out: r0 = result
// r8 accumulates the result and r9 holds n-2 across the first recursive call;
// both are saved on entry together with lr and restored on exit.
.global _fib
_fib:
  stmfd r13!,{r8, r9, lr} // same as stmdb (decrement before) or push, ! means write back in r13

  // return 0 if n == 0
  mov r8, #0
  cmp r0, #0
  beq _fib_end

  // return 1 if n <= 2 (signed compare; covers fib(1) and fib(2))
  mov r8, #1
  cmp r0, #2
  ble _fib_end

  // store arg-1 and arg-2 in r0 and r9
  sub r0, r0, #1 // arg-1
  sub r9, r0, #1 // arg-2

  // call fib(arg-1)
  mov r8, #0
  bl _fib
  add r8, r0, r8

  // call fib(arg-2)
  mov r0, r9
  bl _fib
  add r8, r0, r8

_fib_end:
  // move the result into the return register and restore the saved registers
  mov r0, r8
  ldmfd r13!,{r8, r9, lr} // same as ldmia (increment after) or pop, ! means write back in r13
  mov pc, lr

hoge.c:

static const int kA = 3;
static const int kB = 4;

int _add(int a, int b);
int _sub(int a, int b);
int _fib(int a);

int main(int argc, char const* argv[]) {
  int a = kA;
  int b = kB;

  int result;
  result = _add(a, b);
  result = _sub(a, b);
  result = _fib(0);
  result = _fib(1);
  result = _fib(2);
  result = _fib(6);
  result = _fib(10);
}

And tested it with gdb+ARM sim.

$cat startup.gdb
target sim
file hoge
load hoge
b main
run

$arm-unknown-eabi-gdb --command=startup.gdb
...
15        result = _fib(0);
(gdb) n
16        result = _fib(1);
(gdb) p result
$1 = 0
(gdb) n
17        result = _fib(2);
(gdb) p result
$2 = 1
(gdb) n
18        result = _fib(6);
(gdb) p result
$3 = 1
(gdb) n
19        result = _fib(10);
(gdb) p result
$4 = 8
(gdb) n
20      }
(gdb) p result
$5 = 55

Thread and context switch

I implemented a simple context switch for ARM. First, I implemented a THREAD and THREADCTL as below.

*** rpi.h
#define MAX_THREADS 1024
void InitThread();
void CreateThread(void *thread_entry);
void ContextSwitch();

/* Life-cycle state of a thread slot. */
typedef enum {
  THREAD_NONE,
  THREAD_CREATED,  /* set by CreateThread */
  THREAD_RUNNING,  /* set for the thread being switched to */
  THREAD_WAITING   /* set for the thread being switched away from */
} THREAD_STATE;

/* One thread: its state plus the saved stack pointer used by the context switch. */
typedef struct {
  THREAD_STATE state;
  int *stack;              /* saved stack pointer (top of the saved register frame) */
  unsigned int stackSize;  /* stack size counted in ints, not bytes */
} THREAD;

/* Table of all threads. */
typedef struct {
  unsigned int currentId;  /* index of the running thread in thread[] */
  unsigned int length;     /* number of slots in use */
  THREAD thread[MAX_THREADS];
} THREADCTL;

extern THREADCTL threadctl;

*** rpi-thread.c
THREADCTL threadctl;

/* Registers the caller as thread 0: the table starts with one entry, which is
 * marked as running and made the current thread. */
void InitThread() {
  threadctl.length = 1;
  threadctl.thread[0].state = THREAD_RUNNING;
  threadctl.currentId = 0;

  /* Thread 0 runs on the stack it already has, so no saved stack pointer
   * exists yet; ContextSwitch() fills it in when switching away. */
  threadctl.thread[0].stack = NULL;
  /* threadctl.thread[0].stackSize = TBD; */
}

void CreateThread(void *thread_entry) {
  unsigned int id = threadctl.length;
  const unsigned int stackSize = 4096;  // 4096 * sizeof(int) allocated
  int *stackBase;

  threadctl.length++;
  threadctl.thread[id].state = THREAD_CREATED;
  stackBase = (int *)malloc(stackSize * sizeof(int));
  stackBase += stackSize - 1;
  threadctl.thread[id].stack = stackBase-13;
  threadctl.thread[id].stackSize = stackSize;

  // push default registers into the stack which will be poped in
  // _context_switch
  *threadctl.thread[id].stack =
      (int)thread_entry;  // r14: lr (to be stored in pc)
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r12
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r11
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r10
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r9
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r8
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r7
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r6
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r5
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r4
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r3
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r2
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r1
  threadctl.thread[id].stack--;
  *threadctl.thread[id].stack = 0;  // r0
  // don't do stack--!
}

/* Switches from the current thread to the next one in round-robin order and
 * shows a status line describing the switch. Does nothing while only one
 * thread exists. */
void ContextSwitch() {
  if (threadctl.length <= 1) {
    return;
  }

  unsigned int currentId = threadctl.currentId;
  unsigned int nextId = currentId + 1;

  // wrap around to the first thread
  if (nextId >= threadctl.length) {
    nextId = 0;
  }

  // Status line. The conversion specifiers now match the argument types
  // (%u for unsigned int, %p for a pointer) and the write is bounded.
  char message[512];
  if (NULL != threadctl.thread[currentId].stack &&
      NULL != threadctl.thread[nextId].stack) {
    // prints the word stored at the top of each saved stack, as before
    snprintf(message, sizeof(message), "%u jumping from th:%u@%p to th:%u@%p",
             timerctl.counter, currentId,
             (void *)*threadctl.thread[currentId].stack, nextId,
             (void *)*threadctl.thread[nextId].stack);
  } else {
    snprintf(message, sizeof(message), "%u jumping from th:%u to th:%u",
             timerctl.counter, currentId, nextId);
  }
  FillRect(0, 16, kWidth, 16, 0);
  PrintStr(0, 16, message, 7);

  // get current sp
  threadctl.thread[currentId].stack = (int *)_get_stack_pointer();

  // do context switch
  // The bookkeeping is updated first: _context_switch only comes back here
  // once this thread is scheduled again.
  threadctl.thread[nextId].state = THREAD_RUNNING;
  threadctl.thread[currentId].state = THREAD_WAITING;
  threadctl.currentId = nextId;
  _context_switch(&threadctl.thread[currentId].stack,
                  &threadctl.thread[nextId].stack);
}

_context_switch is written in assembly: it pushes r0-r12 and r14 (lr) onto the current thread's stack and saves the stack pointer in that thread's ‘stack’ variable.

.global _get_stack_pointer
_get_stack_pointer:
  mov r0, r13
  bx lr
  
// void _context_switch(int **current_stack, int **next_stack)
//   r0 = address of the variable that receives the current stack pointer
//   r1 = address of the variable that holds the next thread's stack pointer
// Saves r0-r12 and lr on the current stack, swaps stack pointers, then pops
// the next thread's r0-r12 and loads its saved lr into pc.
.global _context_switch
_context_switch:
  // same as stmfd/stmdb r13!, {...}
  push {r0-r12,r14}
  // *r0 = sp of the thread being suspended
  str sp, [r0]
  // sp = *r1, the stack of the thread being resumed
  ldr sp, [r1]
  // same as ldmfd/ldmia r13!, {...}
  pop {r0-r12}
  pop {pc} // pc points to the previous lr

I use it in main function.

InitThread(); // add the current thread into THREADCTL
CreateThread(task_a); // create a new thread for task_a and start it
CreateThread(task_b); // create a new thread for task_b and start it
...
// switch contexts using timer interrupt
while (true) {
  _disable_IRQ();
  _wfi();
  if (StatusFifo8(&fifoTimer) == 0) {
    _enable_IRQ();
  } else {
    unsigned char data = GetFifo8(&fifoTimer);
    _enable_IRQ();
      
     switch (data) {
      case (const int)timerData1:
        counter1++;
        SetTimer(timer1, timerInterval1, timerData1);
        draw_counter(0, counter1);
        ContextSwitch();
        break;
    }
  }
}
  
void task_a() {
  unsigned int counter = 0;
  while (true) {
    draw_counter(1, counter++);
    ContextSwitch(); // for now, it's changing contexts by the function itself
  }
}

void task_b() {
  unsigned int counter = 0;
  while (true) {
    draw_counter(2, counter++);
    ContextSwitch(); // for now, it's changing contexts by the function itself
  }
}

Tested it and worked!
Today’s task_a/b call ContextSwitch() explicitly. I’ll make the context switch fully automatic in the next post.

https://github.com/sokoide/rpi-baremetal -> 008_context_switch.

context_switch

Unittest and refactoring for RPi baremetal

I wanted to change the previous array based timer to a list based one, and wanted to write a unittest for the list before implementing.
So, added googletest git repo from Chromium repo as a submodule and created Test.mak for unittest on the build machine for the build machine architecture (OSX, x64, mach-o), then refactored the previous 007-wfi with it for the target architecture (RPi2, cortex-a7, elf).

https://github.com/sokoide/rpi-baremetal -> 007_wfi

make -f Test.mak test
[==========] Running 9 tests from 2 test cases.
[----------] Global test environment set-up.
[----------] 5 tests from Fifo8Case
[ RUN      ] Fifo8Case.Init
[       OK ] Fifo8Case.Init (0 ms)
[ RUN      ] Fifo8Case.Put
[       OK ] Fifo8Case.Put (0 ms)
[ RUN      ] Fifo8Case.PutGet
[       OK ] Fifo8Case.PutGet (0 ms)
[ RUN      ] Fifo8Case.Put4Get2
[       OK ] Fifo8Case.Put4Get2 (0 ms)
[ RUN      ] Fifo8Case.PutOverflow
[       OK ] Fifo8Case.PutOverflow (0 ms)
[----------] 5 tests from Fifo8Case (0 ms total)

[----------] 4 tests from TimerCase
[ RUN      ] TimerCase.InitTimerCtl
[       OK ] TimerCase.InitTimerCtl (0 ms)
[ RUN      ] TimerCase.InsertTimer1
[       OK ] TimerCase.InsertTimer1 (0 ms)
[ RUN      ] TimerCase.InsertTimer5
[       OK ] TimerCase.InsertTimer5 (0 ms)
[ RUN      ] TimerCase.RemoveTimer
[       OK ] TimerCase.RemoveTimer (0 ms)
[----------] 4 tests from TimerCase (0 ms total)

[----------] Global test environment tear-down
[==========] 9 tests from 2 test cases ran. (0 ms total)
[  PASSED  ] 9 tests.

WFI and timer change

Before implementing the context switch, I wanted to improve the interrupt handling and build a better timer.
I was using a busy loop to check for messages from the interrupt handler. First, I used the ARM WFI instruction to wait (sleep) until interrupted.

startup.s:
.global _wfi
_wfi:
  wfi
  bx lr

hoge.c:
while (true) {
  _disable_IRQ();
  _wfi();
  if (StatusFifo8(&fifoTimer) == 0) {
    _enable_IRQ();
  } else {
    unsigned char data = GetFifo8(&fifoTimer);
    _enable_IRQ();
...

And I used FIFO8 used in Haribote OS (32bit tiny OS for x86).

Then quickly wrote dirty timer handling which supports up to MAX_TIMER (currently 512). It’s not using a list but an array and not efficient -> TODO.

#define MAX_TIMER 512
typedef struct _TIMER {
  unsigned int timeout;
  unsigned char data;
} TIMER;

typedef struct _TIMERCTL {
  unsigned int counter, next;
  unsigned int length;  // number of used timers
  FIFO8 *fifo;
  TIMER timer[MAX_TIMER];
} TIMERCTL;

extern TIMERCTL timerctl;

void InitTimer(FIFO8 *fifo);
TIMER *SetTimer(TIMER *timer, unsigned int timeout, unsigned char data);

One more improvement. I bought a USB power cable with a switch! I don’t need to plug-in/out to reboot my code anymore 🙂

Today’s code -> https://github.com/sokoide/rpi-baremetal -> 007_wfi.

USB power cable with switch.
usbcable

3 timers whose intervals are 100ms, 500ms and 1s each.
timer2

1 2 3 8