C++ 메모리 정렬 | Alignment·Padding·False Sharing 완벽 정리

2026년 3월 12일 · 30분 읽기 · 수정 2026년 3월 31일 고급 튜토리얼

이 글의 핵심

C++ 메모리 정렬, 패딩, alignas, alignof, False Sharing 방지, 구조체 최적화를 실전 예제와 함께 정리합니다.

들어가며

메모리 정렬(Alignment)은 CPU가 메모리를 효율적으로 읽고 쓰기 위해 요구하는 주소 경계입니다. 컴파일러는 구조체 멤버 사이에 패딩(Padding)을 삽입해 정렬을 맞추며, 이는 메모리 크기와 성능에 직접적인 영향을 미칩니다.

이 글을 읽으면

alignof, alignas로 정렬을 확인하고 제어합니다
구조체 멤버 순서를 최적화해 메모리를 절약합니다
False Sharing을 방지해 멀티스레드 성능을 개선합니다
SIMD, 캐시 라인 최적화 등 고급 패턴을 익힙니다

메모리 정렬 기본

정렬이란?

CPU는 타입마다 읽기·쓰기가 허용되는 시작 주소(정렬 경계)가 정해져 있습니다. 예를 들어, int는 4바이트 경계(주소가 4의 배수)에서 시작해야 효율적이며, 일부 CPU는 정렬되지 않은 접근을 금지하거나 성능 저하를 일으킵니다.

타입별 정렬 요구사항

#include <iostream>
using namespace std;

// Prints the alignment requirement of several fundamental types.
// The trailing comments show the values reported by mainstream 64-bit ABIs.
int main() {
    // Small printer so each type is reported with one call.
    const auto report = [](const char* label, auto alignment) {
        cout << label << alignment << endl;
    };

    report("char: ", alignof(char));      // 1
    report("short: ", alignof(short));    // 2
    report("int: ", alignof(int));        // 4
    report("long: ", alignof(long));      // 4 (Windows) / 8 (64-bit Linux)
    report("double: ", alignof(double));  // 8
    report("int*: ", alignof(int*));      // 8 (64-bit)

    return 0;
}

패딩이 생기는 이유

컴파일러는 각 멤버를 정렬 경계에 맞추기 위해 빈 바이트(패딩)를 삽입합니다.

// Deliberately padding-heavy layout: the 8-byte-aligned double sits between two
// smaller members, so padding is needed both in front of it and at the end.
// Offsets assume a typical 64-bit ABI (alignof(int) == 4, alignof(double) == 8).
// Note: the order char/int/double would NOT waste 8 bytes - the double would
// land on offset 8 and the struct would be only 16 bytes.
struct Bad {
    char c;    // offset 0 (1 byte)
    // 7 bytes padding (offsets 1-7) so that d starts on an 8-byte boundary
    double d;  // offset 8 (8 bytes)
    int i;     // offset 16 (4 bytes)
    // 4 bytes tail padding (offsets 20-23) so sizeof is a multiple of alignof(Bad)
};  // 24 bytes total

실전 구현

1) 구조체 패딩 최적화

비효율적 배치

#include <iostream>
using namespace std;

// Inefficient member order: the double in the middle forces 7 bytes of padding
// before it and 4 bytes of tail padding after the int (typical 64-bit ABI).
// The order char/int/double would only be 16 bytes, so it does not show the problem.
struct Bad {
    char c;    // 1 byte
    // 7 bytes padding
    double d;  // 8 bytes
    int i;     // 4 bytes
    // 4 bytes tail padding
};  // 24 bytes total

// Prints the size the compiler actually assigns to Bad.
int main() {
    const auto bytes = sizeof(Bad);
    cout << "Bad: " << bytes << endl;

    return 0;
}

최적화 배치

// Members ordered from the strictest alignment to the weakest, which leaves
// only tail padding (sizes assume a typical 64-bit ABI).
struct Good {
    double d;  // offset 0 (8 bytes)
    int i;     // offset 8 (4 bytes)
    char c;    // offset 12 (1 byte)
    // 3 bytes tail padding
};  // 16 bytes total

// Prints the size the compiler assigns to Good.
int main() {
    const auto bytes = sizeof(Good);
    cout << "Good: " << bytes << endl;  // 16 on a typical 64-bit ABI

    return 0;
}

최적화 원칙

정렬 요구(alignof)가 큰 타입을 먼저 배치 (double → int → char)
같은 크기 타입을 그룹화
패딩을 최소화

// Members grouped by size, largest first, so no padding is needed BETWEEN members.
// The members add up to 28 bytes; the struct is still rounded up to a multiple
// of alignof(double), so 4 bytes of tail padding remain.
struct Best {
    double d1;  // 8 bytes
    double d2;  // 8 bytes
    int i1;     // 4 bytes
    int i2;     // 4 bytes
    char c1;    // 1 byte
    char c2;    // 1 byte
    char c3;    // 1 byte
    char c4;    // 1 byte
};  // 32 bytes total (28 bytes of members + 4 bytes tail padding)

// Prints the size the compiler assigns to Best.
int main() {
    const auto bytes = sizeof(Best);
    cout << "Best: " << bytes << endl;  // 32 on a typical 64-bit ABI

    return 0;
}

2) alignas - 정렬 지정

시그니처:

alignas(alignment) type name;

구조체 정렬

#include <iostream>
using namespace std;

// Over-aligned struct: alignas(16) raises the alignment from alignof(int) to 16,
// and sizeof is rounded up to a multiple of that alignment.
struct alignas(16) Aligned {
    int x;
    int y;
};

// Prints the alignment and size of the over-aligned struct.
int main() {
    const auto alignment = alignof(Aligned);
    const auto bytes = sizeof(Aligned);

    cout << "정렬: " << alignment << endl;  // 16
    cout << "크기: " << bytes << endl;      // 16 (8 bytes of members rounded up to the alignment)

    return 0;
}

변수 정렬

#include <cstdint>   // std::uintptr_t
#include <iostream>

// Over-aligns a local array to a 64-byte boundary and prints its address.
int main() {
    alignas(64) int cacheLine[16];  // 64-byte aligned

    // Print the address as an integer so divisibility by 64 is easy to check.
    // This snippet has no using-directive, so the std:: names are qualified.
    std::cout << "주소: " << reinterpret_cast<std::uintptr_t>(cacheLine) << std::endl;
    // Expected: a multiple of 64

    return 0;
}

3) 패딩 제거 (pragma pack)

주의: 패킹된 멤버는 정렬이 보장되지 않으므로 접근이 느려질 수 있고, 멤버의 주소를 일반 포인터나 참조로 받아 역참조하면 undefined behavior가 될 수 있습니다.

#include <iostream>
using namespace std;

// Byte-packed layout: pack(1) removes all padding, so i and d are no longer
// placed on their natural alignment boundaries.
#pragma pack(push, 1)
struct Packed {
    char c;    // offset 0 (1 byte)
    int i;     // offset 1 (4 bytes)
    double d;  // offset 5 (8 bytes)
};  // 13 bytes total (no padding)
#pragma pack(pop)

// Prints the packed size and performs one store to a misaligned member.
int main() {
    cout << "Packed: " << sizeof(Packed) << endl;  // 13
    
    Packed p;
    p.i = 10;  // Unaligned store. Access through the member name is compiled as a
               // (possibly slower) unaligned access; the real hazard is binding an
               // int* or int& to p.i and dereferencing that elsewhere.
    
    return 0;
}

사용 시나리오:

네트워크 프로토콜 (패킷 구조)
파일 포맷 (바이너리 직렬화)
하드웨어 인터페이스 (레지스터 맵)

고급 활용

1) False Sharing 방지

False Sharing: 여러 스레드가 같은 캐시 라인에 있는 서로 다른 변수를 수정할 때, 캐시 라인이 코어 사이를 오가며 성능이 저하되는 현상

문제 코드

#include <atomic>
#include <thread>
#include <vector>
#include <chrono>
#include <iostream>

// Two counters placed back to back so that they normally share one cache line;
// used to demonstrate false sharing.
// Both are explicitly initialised to 0: before C++20 a default-constructed
// std::atomic<int> holds an indeterminate value.
struct Counters {
    std::atomic<int> counter1{0};  // bytes 0-3
    std::atomic<int> counter2{0};  // bytes 4-7
};  // normally both fall inside the same 64-byte cache line

// Runs two threads that each increment their own counter 10,000,000 times and
// prints the elapsed wall-clock time in milliseconds.
int main() {
    using Clock = std::chrono::high_resolution_clock;
    const int kIncrements = 10000000;

    Counters counters;

    const auto began = Clock::now();

    std::thread first([&]() {
        for (int remaining = kIncrements; remaining > 0; --remaining) {
            ++counters.counter1;
        }
    });
    std::thread second([&]() {
        for (int remaining = kIncrements; remaining > 0; --remaining) {
            ++counters.counter2;
        }
    });

    first.join();
    second.join();

    const auto elapsed = Clock::now() - began;
    const auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();

    std::cout << "False Sharing: " << millis << "ms" << std::endl;
    // About 500 ms in the article's measurement; the figure is machine-dependent.

    return 0;
}

해결 코드

// Each counter is aligned to 64 bytes, so the two counters start 64 bytes apart
// and cannot share a 64-byte cache line.
// 64 is the common cache-line size on x86-64; C++17 also offers
// std::hardware_destructive_interference_size in <new> where the toolchain provides it.
// Both counters are explicitly initialised to 0: before C++20 a
// default-constructed std::atomic<int> holds an indeterminate value.
struct CountersAligned {
    alignas(64) std::atomic<int> counter1{0};
    alignas(64) std::atomic<int> counter2{0};
};  // each counter on its own cache line

// Same workload as the false-sharing version, using counters that live on
// separate cache lines; prints the elapsed wall-clock time in milliseconds.
int main() {
    using Clock = std::chrono::high_resolution_clock;
    const int kIncrements = 10000000;

    CountersAligned counters;

    const auto began = Clock::now();

    std::thread first([&]() {
        for (int remaining = kIncrements; remaining > 0; --remaining) {
            ++counters.counter1;
        }
    });
    std::thread second([&]() {
        for (int remaining = kIncrements; remaining > 0; --remaining) {
            ++counters.counter2;
        }
    });

    first.join();
    second.join();

    const auto elapsed = Clock::now() - began;
    const auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();

    std::cout << "No False Sharing: " << millis << "ms" << std::endl;
    // About 150 ms in the article's measurement (roughly 3x faster); machine-dependent.

    return 0;
}

2) SIMD 정렬

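SSE/AVX의 정렬 로드·스토어 명령(_mm_load_ps, _mm256_load_ps 등)은 각각 16/32바이트 정렬이 필요합니다. 정렬을 보장할 수 없으면 _mm_loadu_ps, _mm256_loadu_ps 같은 비정렬 버전을 사용합니다.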

#include <immintrin.h>
#include <iostream>

// Doubles eight floats with AVX and prints the results.
int main() {
    // ❌ No alignment requested: only alignof(float) is guaranteed
    float data1[8];
    // __m256 a = _mm256_load_ps(data1);  // may fault if data1 is not 32-byte aligned

    // ✅ 32-byte aligned input, as the aligned load requires
    alignas(32) float data2[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    const __m256 input = _mm256_load_ps(data2);

    // Multiply every lane by 2
    const __m256 doubled = _mm256_mul_ps(input, _mm256_set1_ps(2.0f));

    // The aligned store needs a 32-byte aligned destination as well
    alignas(32) float result[8];
    _mm256_store_ps(result, doubled);

    for (int lane = 0; lane < 8; ++lane) {
        std::cout << result[lane] << " ";  // 2 4 6 8 10 12 14 16
    }

    return 0;
}

3) 정렬된 메모리 할당

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <new>

/// Minimal allocator returning storage aligned to at least `Alignment` bytes.
///
/// The alignment actually requested from the OS is the largest of `Alignment`,
/// `alignof(T)` and `sizeof(void*)`:
///  - posix_memalign rejects alignments smaller than sizeof(void*), so the
///    default `AlignedAllocator<int>` would otherwise always throw on 64-bit POSIX;
///  - after a rebind to a stricter type, alignof(T) may exceed `Alignment`.
///
/// @tparam T          element type
/// @tparam Alignment  requested alignment in bytes; must be a power of two
template<typename T, size_t Alignment = alignof(T)>
class AlignedAllocator {
    static_assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0,
                  "Alignment must be a power of two");

    static constexpr size_t larger(size_t lhs, size_t rhs) noexcept {
        return lhs < rhs ? rhs : lhs;
    }

    /// All three inputs are powers of two, so the maximum is a power of two
    /// and a multiple of sizeof(void*), as posix_memalign requires.
    static constexpr size_t effective_alignment() noexcept {
        return larger(larger(Alignment, alignof(T)), sizeof(void*));
    }

public:
    using value_type = T;

    /// Needed by std::allocator_traits: automatic rebinding does not work for
    /// templates that have a non-type parameter.
    template<typename U>
    struct rebind {
        using other = AlignedAllocator<U, Alignment>;
    };

    AlignedAllocator() noexcept = default;

    /// Converting constructor used by containers after a rebind (stateless).
    template<typename U>
    AlignedAllocator(const AlignedAllocator<U, Alignment>&) noexcept {}

    /// Allocates uninitialised storage for `n` objects of type T.
    /// @throws std::bad_alloc if `n * sizeof(T)` overflows or the OS allocation fails
    T* allocate(size_t n) {
        if (n > std::numeric_limits<size_t>::max() / sizeof(T)) {
            throw std::bad_alloc();  // the byte count would overflow
        }

        // A zero-byte request may legitimately yield a null pointer, which would
        // be mistaken for a failure below; ask for one element instead.
        const size_t bytes = (n == 0 ? 1 : n) * sizeof(T);

        void* ptr = nullptr;

#ifdef _WIN32
        ptr = _aligned_malloc(bytes, effective_alignment());
#else
        if (posix_memalign(&ptr, effective_alignment(), bytes) != 0) {
            ptr = nullptr;
        }
#endif

        if (!ptr) {
            throw std::bad_alloc();
        }

        return static_cast<T*>(ptr);
    }

    /// Releases storage obtained from allocate(); the element count is unused.
    void deallocate(T* ptr, size_t) noexcept {
#ifdef _WIN32
        _aligned_free(ptr);  // memory from _aligned_malloc must not go to free()
#else
        std::free(ptr);
#endif
    }
};

/// The allocator is stateless, so any two instances with the same alignment
/// can free each other's memory.
template<typename T, typename U, size_t Alignment>
bool operator==(const AlignedAllocator<T, Alignment>&,
                const AlignedAllocator<U, Alignment>&) noexcept {
    return true;
}

template<typename T, typename U, size_t Alignment>
bool operator!=(const AlignedAllocator<T, Alignment>&,
                const AlignedAllocator<U, Alignment>&) noexcept {
    return false;
}

// Allocates 100 doubles on a 64-byte boundary, prints the address and frees them.
int main() {
    AlignedAllocator<double, 64> allocator;

    double* data = allocator.allocate(100);

    // Print the address as an integer so divisibility by 64 is easy to check.
    std::cout << "주소: " << reinterpret_cast<std::uintptr_t>(data) << std::endl;
    // Expected: a multiple of 64

    // The count passed here matches the one given to allocate().
    allocator.deallocate(data, 100);

    return 0;
}

성능 비교

정렬된 접근 vs 정렬되지 않은 접근

테스트: 1억 번 int 읽기

접근 방식	시간	배속
정렬된 접근 (4바이트 경계)	50ms	1x
정렬되지 않은 접근 (1바이트 경계)	200ms	0.25x

결론: 이 측정에서는 정렬된 접근이 4배 빠름 (수치는 CPU와 환경에 따라 크게 달라지며, 최신 x86에서는 캐시 라인을 걸치지 않는 한 차이가 작을 수 있음)

False Sharing vs 캐시 라인 분리

테스트: 2개 스레드, 각 1천만 번 증가

구조	시간	배속
False Sharing (같은 캐시 라인)	500ms	1x
캐시 라인 분리 (alignas(64))	150ms	3.3x

결론: 캐시 라인 분리로 3배 개선

구조체 크기 비교

// Padding-heavy order (typical 64-bit ABI). With char/int/double the double
// would already sit on offset 8 and the struct would be only 16 bytes.
struct Bad {
    char c;    // 1 + 7 padding
    double d;  // 8
    int i;     // 4 + 4 tail padding
};  // 24 bytes

// Strictest alignment first: only tail padding remains (typical 64-bit ABI).
struct Good {
    double d;  // 8
    int i;     // 4
    char c;    // 1 + 3 tail padding
};  // 16 bytes

결론: 멤버 순서 최적화로 33% 절약

실무 사례

사례 1: 멀티스레드 카운터 - False Sharing 방지

#include <atomic>
#include <thread>
#include <vector>
#include <iostream>

// One counter per 64-byte cache line, so adjacent array elements never share a line.
struct alignas(64) AlignedCounter {
    // Explicitly zeroed: before C++20 a default-constructed std::atomic<int>
    // holds an indeterminate value, which would then be incremented and printed.
    std::atomic<int> counter{0};
    // Explicit filler up to 64 bytes. The length is derived from the counter's
    // size rather than hard-coded; alignas(64) alone would also round sizeof up.
    char padding[64 - sizeof(std::atomic<int>)];
};
static_assert(sizeof(AlignedCounter) == 64, "AlignedCounter must occupy exactly one 64-byte line");

int main() {
    AlignedCounter counters[4];
    
    std::vector<std::thread> threads;
    for (int i = 0; i < 4; ++i) {
        threads.emplace_back([&, i]() {
            for (int j = 0; j < 1000000; ++j) {
                counters[i].counter++;
            }
        });
    }
    
    for (auto& t : threads) {
        t.join();
    }
    
    for (int i = 0; i < 4; ++i) {
        std::cout << "Counter " << i << ": " << counters[i].counter << std::endl;
    }
    
    return 0;
}

사례 2: SIMD 벡터 연산

#include <immintrin.h>
#include <iostream>

void vectorAdd(const float* a, const float* b, float* c, size_t n) {
    for (size_t i = 0; i < n; i += 8) {
        __m256 va = _mm256_load_ps(a + i);
        __m256 vb = _mm256_load_ps(b + i);
        __m256 vc = _mm256_add_ps(va, vb);
        _mm256_store_ps(c + i, vc);
    }
}

// Adds two 8-element arrays with vectorAdd and prints the sums.
int main() {
    // All three buffers are 32-byte aligned for the AVX loads and stores.
    alignas(32) float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    alignas(32) float b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    alignas(32) float c[8];

    vectorAdd(a, b, c, 8);

    for (int lane = 0; lane < 8; ++lane) {
        std::cout << c[lane] << " ";  // 9 9 9 9 9 9 9 9
    }

    return 0;
}

사례 3: 네트워크 프로토콜 - 패딩 제거

#include <cstdint>
#include <iostream>

// Wire-format header. pack(1) removes all padding so the in-memory layout has
// exactly the field offsets listed below.
#pragma pack(push, 1)
struct PacketHeader {
    uint8_t version;     // offset 0, 1 byte
    uint16_t length;     // offset 1, 2 bytes
    uint32_t sequence;   // offset 3, 4 bytes
    uint64_t timestamp;  // offset 7, 8 bytes
};  // 15 bytes total (no padding)
#pragma pack(pop)

// Fail the build if a compiler ever lays the header out differently.
static_assert(sizeof(PacketHeader) == 15, "PacketHeader must be exactly 15 bytes on the wire");

// Prints the packed header size and fills in one header.
int main() {
    const auto wireBytes = sizeof(PacketHeader);
    std::cout << "PacketHeader: " << wireBytes << std::endl;  // 15

    // Members in declaration order: version, length, sequence, timestamp.
    PacketHeader header{1, 100, 12345, 1234567890};

    // Send over the network
    // send(socket, &header, sizeof(header), 0);
    // NOTE(review): multi-byte fields are stored in host byte order here; a real
    // protocol would normally convert them to network byte order before sending.

    return 0;
}

사례 4: 게임 엔진 - 데이터 지향 설계

#include <vector>
#include <iostream>

// ❌ AoS (Array of Structures): every entity keeps all of its fields together,
// so a pass that only needs positions and velocities still pulls health and id
// through the cache.
struct EntityAoS {
    float x, y, z;      // position
    float vx, vy, vz;   // velocity
    int health;
    int id;
};

// 10,000 value-initialised entities stored contiguously.
std::vector<EntityAoS> entitiesAoS(10000);

// ✅ SoA (Structure of Arrays): each field is stored in its own contiguous
// array, so a pass over positions and velocities touches only those arrays.
// Element i of every vector belongs to the same entity.
struct EntitiesSoA {
    std::vector<float> x, y, z;     // position
    std::vector<float> vx, vy, vz;  // velocity
    std::vector<int> health;
    std::vector<int> id;
};

/// Advances every entity by one time step: position += velocity * dt.
///
/// Only the entities present in all six position/velocity arrays are updated,
/// so arrays of different lengths can no longer cause an out-of-bounds access.
///
/// @param entities  entity data, updated in place
/// @param dt        time step
void updatePositions(EntitiesSoA& entities, float dt) {
    // Use the shortest of the six arrays as the entity count.
    size_t count = entities.x.size();
    for (const std::vector<float>* column :
         {&entities.y, &entities.z, &entities.vx, &entities.vy, &entities.vz}) {
        if (column->size() < count) {
            count = column->size();
        }
    }

    for (size_t i = 0; i < count; ++i) {
        entities.x[i] += entities.vx[i] * dt;
        entities.y[i] += entities.vy[i] * dt;
        entities.z[i] += entities.vz[i] * dt;
    }
}

int main() {
    EntitiesSoA entities;
    entities.x.resize(10000);
    entities.y.resize(10000);
    entities.z.resize(10000);
    entities.vx.resize(10000, 1.0f);
    entities.vy.resize(10000, 1.0f);
    entities.vz.resize(10000, 1.0f);
    
    updatePositions(entities, 0.016f);
    
    std::cout << "위치 업데이트 완료" << std::endl;
    
    return 0;
}

트러블슈팅

문제 1: 정렬되지 않은 접근

증상: 크래시 또는 성능 저하

// The two halves below are alternatives, not one translation unit.

// ❌ Not aligned: a char array only guarantees 1-byte alignment, and the
// address is additionally offset by one byte
char buffer[100];
int* ptr = reinterpret_cast<int*>(buffer + 1);
*ptr = 10;  // misaligned store (slow, or a crash on CPUs that forbid it)

// ✅ Alignment guaranteed: the array itself is aligned like an int
alignas(int) char buffer[100];
int* ptr = reinterpret_cast<int*>(buffer);
*ptr = 10;

문제 2: 구조체 크기 가정

증상: 직렬화 오류, 메모리 계산 오류

// 1-byte char followed by an int: 3 bytes of padding are inserted before i
// when alignof(int) == 4.
struct Data {
    char c;
    int i;
};

// ❌ Wrong assumption
// Assuming sizeof(Data) == 5 (it is actually 8)

// ✅ Ask the compiler with sizeof
size_t size = sizeof(Data);  // 8

// ✅ Verify the assumption at compile time with static_assert
static_assert(sizeof(Data) == 8, "Data size mismatch");

문제 3: 플랫폼별 차이

증상: Windows와 Linux에서 다른 크기

// The size of this struct depends on the platform because sizeof(long) does.
struct Data {
    long l;
    int i;
};

// 64-bit Windows: sizeof(long) == 4  -> sizeof(Data) == 8
// 64-bit Linux:   sizeof(long) == 8  -> sizeof(Data) == 16 (4 bytes tail padding)

// ✅ 고정 크기 타입 사용
#include <cstdint>

// Fixed-width members keep the member sizes identical on every platform.
struct DataFixed {
    int64_t l;  // always 8 bytes
    int32_t i;  // always 4 bytes
};

문제 4: SIMD 정렬 오류

증상: _mm256_load_ps 크래시

// The three variants below are alternatives, not one translation unit.

// ❌ Not aligned: only alignof(float) is guaranteed
float data[8];
__m256 a = _mm256_load_ps(data);  // may crash: the aligned load requires a 32-byte aligned address

// ✅ 32-byte aligned
alignas(32) float data[8];
__m256 a = _mm256_load_ps(data);  // safe

// Or use the unaligned load
__m256 a = _mm256_loadu_ps(data);  // safe for any address; may be slower than the aligned load

마무리

C++ 메모리 정렬은 성능과 메모리 효율성에 직접적인 영향을 미칩니다.

핵심 요약

정렬 기본
- CPU는 타입별 정렬 경계 요구
- 컴파일러는 패딩을 삽입해 정렬 맞춤
- alignof로 확인, alignas로 제어
구조체 최적화
- 큰 타입을 먼저 배치
- 같은 크기 타입을 그룹화
- 패딩을 최소화
False Sharing 방지
- 캐시 라인(64 bytes) 분리
- alignas(64) 사용
- 멀티스레드 성능 약 3배 개선 (이 글의 측정 기준, 환경에 따라 다름)
SIMD 최적화
- SSE: 16바이트 정렬
- AVX: 32바이트 정렬
- _mm256_load_ps vs _mm256_loadu_ps

선택 가이드

상황	방법
구조체 크기 줄이기	큰 타입 먼저 배치
멀티스레드 카운터	`alignas(64)`
SIMD 연산	`alignas(32)`
네트워크 프로토콜	`#pragma pack(1)`

코드 예제 치트시트

// Query an alignment
cout << alignof(int) << endl;

// Query a size
cout << sizeof(MyStruct) << endl;

// Align a variable
alignas(64) int cacheLine[16];

// Align a struct
struct alignas(16) Aligned { int x, y; };

// Remove padding (use with care: members are no longer naturally aligned)
#pragma pack(push, 1)
struct Packed { char c; int i; };
#pragma pack(pop)

// Alignment for SIMD (the aligned AVX load needs a 32-byte aligned address)
alignas(32) float data[8];
__m256 a = _mm256_load_ps(data);

다음 단계

캐시 최적화: C++ 캐시 최적화
Data-Oriented Design: C++ 캐시와 Data-Oriented Design
캐시 친화적 코드: C++ 캐시 친화적 코드 작성법

참고 자료

“What Every Programmer Should Know About Memory” - Ulrich Drepper
cppreference: https://en.cppreference.com/w/cpp/language/object#Alignment
Intel Intrinsics Guide: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/

한 줄 정리: 메모리 정렬은 성능과 직결되며, 구조체 멤버 순서 최적화와 False Sharing 방지로 멀티스레드 성능을 크게 개선할 수 있다.