Some interesting tidbits about LLVM

LLVM definitely does some interesting things as part of its toolchain.

Consider the humble HelloWorld:

   1: #include <stdio.h>
   2:  
   3: int main() {
   4:   printf("hello world\n");
   5:   return 0;
   6: }

Assuming you have working installations of llvm and llvm-gcc on your system, you can compile it into LLVM bitcode (with something like "llvm-gcc -O3 -emit-llvm -c hello.c -o hello.bc"). This bitcode is directly executable using the lli.exe from llvm:

$ lli < hello.bc
hello world

Meh. Not so interesting. Let's look at the LLVM bitcode for the code, though (disassembled into its human-readable form with llvm-dis)--that's interesting as a first peek at what LLVM bitcode might look like:

   1: ; ModuleID = '<stdin>'
   2: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
   3: target triple = "mingw32"
   4: @.str = internal constant [12 x i8] c"hello world\00"        ; <[12 x i8]*> [#uses=1] 
   5:  
   6: define i32 @main() {
   7: entry:
   8:     %tmp2 = tail call i32 @puts( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0) )        ; <i32> [#uses=0]
   9:     ret i32 0
  10: } 
  11:  
  12: declare i32 @puts(i8*)

Hmm. Now of course, LLVM also has to be able to get down to actual machine instructions, and in point of fact there is a tool in the LLVM toolchain, called llc, that can do this transformation ahead-of-time, like so:

$ llc hello.bc -o hello.bc.s -march x86

And, looking at the results, we see...

   1: .text
   2: .align    16
   3: .globl    _main
   4: .def     _main;    .scl    2;    .type    32;    .endef
   5: _main:
   6: pushl    %ebp
   7: movl    %esp, %ebp
   8: subl    $8, %esp
   9: andl    $4294967280, %esp
  10: movl    $16, %eax
  11: call    __alloca
  12: call    ___main
  13: movl    $_.str, (%esp)
  14: call    _puts
  15: xorl    %eax, %eax
  16: movl    %ebp, %esp
  17: popl    %ebp
  18: ret
  19: .data
  20: _.str:                # .str
  21: .asciz    "hello world"
  22: .def     _puts;    .scl    2;    .type    32;    .endef

Bleah. Assembly language, and in AT&T (GNU assembler) syntax, to boot. (What did you expect, anyway?)

Of course, assembly language and C were always considered fairly close together in terms of their level of abstraction (C was designed as a replacement for assembly language when porting Unix, remember), so it might not be too hard to...

$ llc hello.bc -o hello.bc.c -march c

And get...

   1: /* Provide Declarations */
   2: #include <stdarg.h>
   3: #include <setjmp.h>
   4: /* get a declaration for alloca */
   5: #if defined(__CYGWIN__) || defined(__MINGW32__)
   6: #define  alloca(x) __builtin_alloca((x))
   7: #define _alloca(x) __builtin_alloca((x))
   8: #elif defined(__APPLE__)
   9: extern void *__builtin_alloca(unsigned long);
  10: #define alloca(x) __builtin_alloca(x)
  11: #define longjmp _longjmp
  12: #define setjmp _setjmp
  13: #elif defined(__sun__)
  14: #if defined(__sparcv9)
  15: extern void *__builtin_alloca(unsigned long);
  16: #else
  17: extern void *__builtin_alloca(unsigned int);
  18: #endif
  19: #define alloca(x) __builtin_alloca(x)
  20: #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
  21: #define alloca(x) __builtin_alloca(x)
  22: #elif defined(_MSC_VER)
  23: #define inline _inline
  24: #define alloca(x) _alloca(x)
  25: #else
  26: #include <alloca.h>
  27: #endif
  28:  
  29: #ifndef __GNUC__  /* Can only support "linkonce" vars with GCC */
  30: #define __attribute__(X)
  31: #endif
  32:  
  33: #if defined(__GNUC__) && defined(__APPLE_CC__)
  34: #define __EXTERNAL_WEAK__ __attribute__((weak_import))
  35: #elif defined(__GNUC__)
  36: #define __EXTERNAL_WEAK__ __attribute__((weak))
  37: #else
  38: #define __EXTERNAL_WEAK__
  39: #endif
  40:  
  41: #if defined(__GNUC__) && defined(__APPLE_CC__)
  42: #define __ATTRIBUTE_WEAK__
  43: #elif defined(__GNUC__)
  44: #define __ATTRIBUTE_WEAK__ __attribute__((weak))
  45: #else
  46: #define __ATTRIBUTE_WEAK__
  47: #endif
  48:  
  49: #if defined(__GNUC__)
  50: #define __HIDDEN__ __attribute__((visibility("hidden")))
  51: #endif
  52:  
  53: #ifdef __GNUC__
  54: #define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */
  55: #define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */
  56: #define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */
  57: #define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */
  58: #define LLVM_INF           __builtin_inf()         /* Double */
  59: #define LLVM_INFF          __builtin_inff()        /* Float */
  60: #define LLVM_PREFETCH(addr,rw,locality) __builtin_prefetch(addr,rw,locality)
  61: #define __ATTRIBUTE_CTOR__ __attribute__((constructor))
  62: #define __ATTRIBUTE_DTOR__ __attribute__((destructor))
  63: #define LLVM_ASM           __asm__
  64: #else
  65: #define LLVM_NAN(NanStr)   ((double)0.0)           /* Double */
  66: #define LLVM_NANF(NanStr)  0.0F                    /* Float */
  67: #define LLVM_NANS(NanStr)  ((double)0.0)           /* Double */
  68: #define LLVM_NANSF(NanStr) 0.0F                    /* Float */
  69: #define LLVM_INF           ((double)0.0)           /* Double */
  70: #define LLVM_INFF          0.0F                    /* Float */
  71: #define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */
  72: #define __ATTRIBUTE_CTOR__
  73: #define __ATTRIBUTE_DTOR__
  74: #define LLVM_ASM(X)
  75: #endif
  76:  
  77: #if __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ 
  78: #define __builtin_stack_save() 0   /* not implemented */
  79: #define __builtin_stack_restore(X) /* noop */
  80: #endif
  81:  
  82: #define CODE_FOR_MAIN() /* Any target-specific code for main()*/
  83:  
  84: #ifndef __cplusplus
  85: typedef unsigned char bool;
  86: #endif
  87:  
  88:  
  89: /* Support for floating point constants */
  90: typedef unsigned long long ConstantDoubleTy;
  91: typedef unsigned int        ConstantFloatTy;
  92: typedef struct { unsigned long long f1; unsigned short f2; unsigned short pad[3]; } ConstantFP80Ty;
  93: typedef struct { unsigned long long f1; unsigned long long f2; } ConstantFP128Ty;
  94:  
  95:  
  96: /* Global Declarations */
  97: /* Helper union for bitcasts */
  98: typedef union {
  99:   unsigned int Int32;
 100:   unsigned long long Int64;
 101:   float Float;
 102:   double Double;
 103: } llvmBitCastUnion;
 104:  
 105: /* External Global Variable Declarations */
 106:  
 107: /* Function Declarations */
 108: double fmod(double, double);
 109: float fmodf(float, float);
 110: long double fmodl(long double, long double);
 111: unsigned int main(void);
 112: unsigned int puts(unsigned char *);
 113: unsigned char *malloc();
 114: void free(unsigned char *);
 115: void abort(void);
 116:  
 117:  
 118: /* Global Variable Declarations */
 119: static unsigned char _2E_str[12];
 120:  
 121:  
 122: /* Global Variable Definitions and Initialization */
 123: static unsigned char _2E_str[12] = "hello world";
 124:  
 125:  
 126: /* Function Bodies */
 127: static inline int llvm_fcmp_ord(double X, double Y) { return X == X && Y == Y; }
 128: static inline int llvm_fcmp_uno(double X, double Y) { return X != X || Y != Y; }
 129: static inline int llvm_fcmp_ueq(double X, double Y) { return X == Y || llvm_fcmp_uno(X, Y); }
 130: static inline int llvm_fcmp_une(double X, double Y) { return X != Y; }
 131: static inline int llvm_fcmp_ult(double X, double Y) { return X <  Y || llvm_fcmp_uno(X, Y); }
 132: static inline int llvm_fcmp_ugt(double X, double Y) { return X >  Y || llvm_fcmp_uno(X, Y); }
 133: static inline int llvm_fcmp_ule(double X, double Y) { return X <= Y || llvm_fcmp_uno(X, Y); }
 134: static inline int llvm_fcmp_uge(double X, double Y) { return X >= Y || llvm_fcmp_uno(X, Y); }
 135: static inline int llvm_fcmp_oeq(double X, double Y) { return X == Y ; }
 136: static inline int llvm_fcmp_one(double X, double Y) { return X != Y && llvm_fcmp_ord(X, Y); }
 137: static inline int llvm_fcmp_olt(double X, double Y) { return X <  Y ; }
 138: static inline int llvm_fcmp_ogt(double X, double Y) { return X >  Y ; }
 139: static inline int llvm_fcmp_ole(double X, double Y) { return X <= Y ; }
 140: static inline int llvm_fcmp_oge(double X, double Y) { return X >= Y ; }
 141:  
 142: unsigned int main(void) {
 143:   unsigned int llvm_cbe_tmp2;
 144:  
 145:   CODE_FOR_MAIN();
 146:   llvm_cbe_tmp2 =  /*tail*/ puts((&(_2E_str[((signed int )((unsigned int )0))])));
 147:   return ((unsigned int )0);
 148: }

Granted, it's some ugly-looking C code, with all those preprocessor fragments floating around in there, but if you take a few moments and go down to the main() definition, it's C to bitcode to C. We've come full circle.

Looking back at that first disassembly dump, I'm struck by how LLVM bitcode looks a lot like any other high-level assembly or low-level virtual machine language, even reminiscent of MSIL. In fact, there's probably a pretty close correlation between LLVM bitcode and MSIL.

In point of fact, LLVM knows this, too:

$ llc hello.bc -o hello.bc.il -march msil

And check out what it generates:

   1: .assembly extern mscorlib {}
   2: .assembly MSIL {}
   3:  
   4: // External
   5: .method static hidebysig pinvokeimpl("MSVCRT.DLL")
   6:     unsigned int32 modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) 'puts'(void* ) preservesig {}
   7:  
   8: .method static hidebysig pinvokeimpl("MSVCRT.DLL")
   9:     vararg void* modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) 'malloc'() preservesig {}
  10:  
  11: .method static hidebysig pinvokeimpl("MSVCRT.DLL")
  12:     void modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) 'free'(void* ) preservesig {}
  13:  
  14: .method public hidebysig static pinvokeimpl("KERNEL32.DLL" ansi winapi)  native int LoadLibrary(string) preservesig {}
  15: .method public hidebysig static pinvokeimpl("KERNEL32.DLL" ansi winapi)  native int GetProcAddress(native int, string) preservesig {}
  16: .method private static void* $MSIL_Import(string lib,string sym)
  17:  managed cil
  18: {
  19:     ldarg    lib
  20:     call    native int LoadLibrary(string)
  21:     ldarg    sym
  22:     call    native int GetProcAddress(native int,string)
  23:     dup
  24:     brtrue    L_01
  25:     ldstr    "Can no import variable"
  26:     newobj    instance void [mscorlib]System.Exception::.ctor(string)
  27:     throw
  28: L_01:
  29:     ret
  30: }
  31:  
  32: .method static private void $MSIL_Init() managed cil
  33: {
  34:     ret
  35: }
  36:  
  37: // Declarations
  38: .class value explicit ansi sealed 'unsigned int8 [12]' { .pack 1 .size 12 }
  39:  
  40: // Definitions
  41: .field static private valuetype 'unsigned int8 [12]' '.str' at '.str$data'
  42: .data '.str$data' = {
  43: int8 (104),
  44: int8 (101),
  45: int8 (108),
  46: int8 (108),
  47: int8 (111),
  48: int8 (32),
  49: int8 (119),
  50: int8 (111),
  51: int8 (114),
  52: int8 (108),
  53: int8 (100),
  54: int8 (0) [1]
  55: }
  56:  
  57: // Startup code
  58: .method static public int32 $MSIL_Startup() {
  59:     .entrypoint
  60:     .locals (native int i)
  61:     .locals (native int argc)
  62:     .locals (native int ptr)
  63:     .locals (void* argv)
  64:     .locals (string[] args)
  65:     call    string[] [mscorlib]System.Environment::GetCommandLineArgs()
  66:     dup
  67:     stloc    args
  68:     ldlen
  69:     conv.i4
  70:     dup
  71:     stloc    argc
  72:     ldc.i4    4
  73:     mul
  74:     localloc
  75:     stloc    argv
  76:     ldc.i4.0
  77:     stloc    i
  78: L_01:
  79:     ldloc    i
  80:     ldloc    argc
  81:     ceq
  82:     brtrue    L_02
  83:     ldloc    args
  84:     ldloc    i
  85:     ldelem.ref
  86:     call    native int [mscorlib]System.Runtime.InteropServices.Marshal::StringToHGlobalAnsi(string)
  87:     stloc    ptr
  88:     ldloc    argv
  89:     ldloc    i
  90:     ldc.i4    4
  91:     mul
  92:     add
  93:     ldloc    ptr
  94:     stind.i
  95:     ldloc    i
  96:     ldc.i4.1
  97:     add
  98:     stloc    i
  99:     br    L_01
 100: L_02:
 101:     call void $MSIL_Init()
 102:     call    unsigned int32 modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) main()
 103:     conv.i4
 104:     ret
 105: }
 106:  
 107: .method static public unsigned int32 modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) 'main'
 108:     () cil managed
 109: {
 110:     .locals (unsigned int32 'ltmp_0_1')
 111:     .maxstack    16
 112: ltmp_1_2:
 113:  
 114: //    %tmp2 = tail call i32 @puts( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0) )        ; <i32> [#uses=0]
 115:  
 116:     ldsflda    valuetype 'unsigned int8 [12]' '.str'
 117:     conv.u4
 118:     call    unsigned int32 modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) 'puts'(void* )
 119:     stloc    'ltmp_0_1'
 120:  
 121: //    ret i32 0
 122:  
 123:     ldc.i4    0
 124:     ret
 125: }

Holy frickin' crap. I think I'm in love.