Skip to content — C++ Inline Assembly | asm Keyword Guide

C++ Inline Assembly | asm Keyword Guide

C++ Inline Assembly | asm Keyword Guide

Key takeaways

Inline assembly (`asm`) lets you embed assembly inside C++ to use architecture-specific instructions. This article covers GCC/Clang AT&T syntax vs MSVC Intel syntax, constraints, and alternatives—with examples.

Basic syntax

Inline assembly is usually considered only when you hit a performance bottleneck or need special instructions. This article lists examples so you can learn compiler-specific syntax differences and recognize where portability is easily lost.

GCC/Clang (AT&T syntax)

Example main implementation:

int main() {
    const int x = 10;
    const int y = 20;
    int result;

    // The "0" constraint places y in the same register as the output, so the
    // instruction computes result = y + x in place.
    asm("addl %[addend], %[sum]"
        : [sum] "=r" (result)          // output
        : [addend] "r" (x), "0" (y)    // inputs
    );

    cout << result << endl;  // 30
}

MSVC (Intel syntax)

Example main implementation:

// Adds x and y with an MSVC __asm block and prints the sum.
// NOTE(review): MSVC documents __asm blocks as available for 32-bit x86
// targets only — confirm the build target before relying on this form.
int main() {
    int x = 10;
    int y = 20;
    int result;
    
    // Intel syntax: destination operand first; C++ locals are referenced by name.
    __asm {
        mov eax, x
        add eax, y
        mov result, eax
    }
    
    cout << result << endl;  // 30
}

Register constraints

Example add implementation:

// Returns a + b computed by a single `addl` instruction.
int add(int a, int b) {
    int sum = a;

    // "+r": sum is both read and written, in any general-purpose register.
    asm("addl %1, %0"
        : "+r" (sum)
        : "r" (b)
    );

    return sum;
}

// Constraint letters:
// r: general-purpose register
// a: %eax/%rax
// b: %ebx/%rbx
// c: %ecx/%rcx
// d: %edx/%rdx
// m: memory
// i: immediate

Practical examples

Example 1: CPUID

#include <cstring>
#include <iostream>
using namespace std;

// Executes CPUID for leaf `code` with sub-leaf 0 and stores the resulting
// EAX, EBX, ECX and EDX values through a, b, c and d respectively.
// All four pointers must be valid; they are dereferenced unconditionally.
void cpuid(int code, int* a, int* b, int* c, int* d) {
    // CPUID reads ECX as the sub-leaf index for some leaves (e.g. 4, 7, 0xB,
    // 0xD). Leaving it unspecified would query an unpredictable sub-leaf, so
    // it is pinned to 0 here.
    asm volatile("cpuid"
        : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
        : "a"(code), "c"(0)
    );
}

// Prints the 12-character CPU vendor string reported by CPUID leaf 0.
int main() {
    int a, b, c, d;
    cpuid(0, &a, &b, &c, &d);

    // The vendor string is laid out as EBX, EDX, ECX (in that order).
    // memcpy copies the register bytes without casting the char buffer to
    // int*, which would break strict-aliasing and alignment rules.
    char vendor[13];
    std::memcpy(vendor, &b, sizeof b);
    std::memcpy(vendor + 4, &d, sizeof d);
    std::memcpy(vendor + 8, &c, sizeof c);
    vendor[12] = '\0';

    cout << "CPU: " << vendor << endl;
}

Example 2: Atomic operation

// Atomically adds 1 to *ptr and returns the value *ptr held beforehand.
int atomicIncrement(int* ptr) {
    // Holds the addend on entry; XADD swaps the previous *ptr into it.
    int previous = 1;

    asm volatile(
        "lock; xaddl %0, %1"
        : "+r" (previous), "+m" (*ptr)
        :
        : "memory"
    );

    return previous;
}

int main() {
    int counter = 0;

    // Ten increments; the returned previous value is not needed here.
    int remaining = 10;
    while (remaining-- > 0) {
        atomicIncrement(&counter);
    }

    cout << counter << endl;  // 10
}

Example 3: Timestamp counter

// Reads the x86 time-stamp counter and returns it as one 64-bit value.
uint64_t rdtsc() {
    uint32_t low;
    uint32_t high;

    // RDTSC delivers the low half in EAX and the high half in EDX.
    asm volatile("rdtsc" : "=a"(low), "=d"(high));

    const uint64_t upper = static_cast<uint64_t>(high) << 32;
    return upper | low;
}

int main() {
    const uint64_t begin = rdtsc();

    // Code to measure
    for (int iteration = 0; iteration != 1000000; ++iteration) {
        // ...
    }

    const uint64_t finish = rdtsc();
    const uint64_t elapsed = finish - begin;

    cout << "Cycles: " << elapsed << endl;
}

Example 4: Memory barriers

Example memoryBarrier implementation:

// Emits an MFENCE instruction and also acts as a compiler-level memory barrier.
void memoryBarrier() {
    asm volatile(
        "mfence"
        :            // no outputs
        :            // no inputs
        : "memory"   // compiler must not move memory accesses across this point
    );
}

// Compiler-only barrier: the template is empty, so no instruction is emitted.
void compilerBarrier() {
    asm volatile(
        ""
        :            // no outputs
        :            // no inputs
        : "memory"   // compiler must not move memory accesses across this point
    );
}

// Usage
atomic<bool> ready{false};
int data{0};

// Writes the payload first, then raises the `ready` flag with release ordering.
void producer() {
    data = 42;
    compilerBarrier();  // prevent reordering
    ready.store(true, memory_order_release);
}

volatile

Example main implementation:

int main() {
    // volatile: prevent optimization
    asm volatile("nop");  // not eliminated

    // Memory clobber: the compiler may not keep memory values cached in
    // registers or move loads/stores across this statement. The template is
    // empty, so no instruction is emitted and the CPU itself is not fenced.
    asm volatile("" ::: "memory");
}

Platform differences

x86-64

// 64-bit registers: the template overwrites %rax, so it is listed as a
// clobber; otherwise the compiler may keep a live value in that register
asm("movq %0, %%rax" : : "r"(value) : "rax");

ARM

// ARM syntax: r0 is overwritten by the template, so declare it as clobbered
asm("mov r0, %0" : : "r"(value) : "r0");

Cross-platform

// volatile: a counter read has outputs but no inputs, so without it the
// compiler may merge repeated reads or drop one whose result looks unused
#if defined(__x86_64__)
    asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
#elif defined(__aarch64__)
    asm volatile("mrs %0, cntvct_el0" : "=r"(cycles));
#else
    #error "Unsupported platform"
#endif

Common pitfalls

Pitfall 1: Register clobbering

C/C++ example:

// ❌ Register clobber: this form has no operand or clobber list, so nothing
//    tells the compiler that eax is overwritten
asm("movl $10, %eax");  // clobbers eax

// ✅ Specify clobbers: the third colon section names each overwritten register
//    (note the doubled %% on register names once an operand list is present)
asm("movl $10, %%eax"
    :
    :
    : "%eax"  // eax is clobbered
);

Pitfall 2: Optimization interference

// ❌ Removed by optimization
asm("nop");

// ✅ Use volatile
asm volatile("nop");

Pitfall 3: Platform dependence

C/C++ example:

// ❌ x86-only: the instruction does not exist on other architectures
asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

// ✅ Conditional compilation
// (volatile keeps repeated counter reads from being merged or dropped)
#ifdef __x86_64__
    asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
#else
    // Alternative implementation
#endif

Inline assembly vs intrinsics

C/C++ example:

// Inline assembly
asm("addl %1, %0" : "=r"(result) : "r"(a), "0"(b));

// Intrinsics (preferred)
#include <x86intrin.h>
result = _mm_add_epi32(a, b);

Benefits of intrinsics:

  • Type-safe
  • Optimizable
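  • More portable across compilers (the compiler lowers each intrinsic to the right instruction), although intrinsics such as `_mm_add_epi32` are still specific to one architecture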

Debugging

Example func implementation:

// Small function whose generated assembly we want to inspect
void func() {
    const int x = 10;
    const int y = 2 * x;
}

// Build with:
//   g++ -S -O2 program.cpp
// then open program.s to read the generated assembly

FAQ

Q1: When should I use inline assembly?

A:

  • Extreme optimization
  • Direct hardware access
  • Special instructions (CPUID, RDTSC)

Q2: Intrinsics vs inline assembly?

A: Prefer intrinsics when possible—they are safer and more portable.

Q3: Is a performance win guaranteed?

A: No. The compiler’s optimizations may be better.

Q4: How do I keep things portable?

A:

  • Conditional compilation
  • Use intrinsics
  • Assembly as a last resort

Q5: How do I debug?

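A: Have the compiler emit assembly with `g++ -S -O2 program.cpp` and inspect the resulting `program.s` to confirm that your inline assembly and the surrounding code were generated as intended.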

Q6: Learning resources for inline assembly?

A:

  • GCC inline assembly documentation
  • Intel/AMD manuals
  • PC Assembly Language (Paul Carter)

Other posts that connect to this topic: