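/*
 * Web-page residue carried over with this header (translated from Korean):
 *   "Skip to main content"
 *   "No category"
 *   "For final feedback"
 */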

#pragma once
#ifndef _PCDF_H
# define _PCDF_H

#include <pthread.h>
#include <immintrin.h> // AVX2 intrinsics
#include <omp.h>
#include <sys/mman.h> // For JIT memory allocation
#include <stdint.h>
#include <string.h>

/*
 * PROCEEDING(x) expands to a captureless lambda introducer taking one
 * `autolike` parameter named x; the caller supplies the `{ ... }` body.
 * Lambda syntax is C++ only, so this macro is unusable from plain C.
 * `pcdef` is a short alias for it.
 */
#define PROCEEDING(x) [](autolike x)
#define pcdef PROCEEDING

/* Storage-class hint for hot locals. Disabled; alternative: `register`. */
//#define REG register
#define REG

/*
 * Branch-prediction hint. Disabled; alternative: __builtin_expect((x),1).
 * The expansion is parenthesised so that an expression argument keeps its
 * own precedence wherever the macro is used.
 */
//#define LIKELY(x) __builtin_expect((x),1)
#define LIKELY(x) (x)

/* Forces inlining of the function it decorates (GCC/Clang attribute). */
#define FORSE_ __attribute__((always_inline))

/* No-alias qualifier. Disabled; alternative: `restrict`. */
//#define RST restrict
#define RST

/* Optional alignment assumption on the local `B`. Disabled. */
//#define OPTB B = __builtin_assume_aligned(B, 64);
#define OPTB

/* Opaque argument handed to every callback. */
typedef void *autolike;
/* A callback: takes one opaque argument and returns nothing. */
typedef void (*pcd)(autolike);
/* Long-form alias of pcd. */
typedef pcd proceedings;
/* Pointer into an array of callbacks. */
typedef pcd *pcdx;
/* Long-form alias of pcdx. */
typedef pcdx proceed_ptr;

typedef struct {
    pcdx workflow;
    pcdx ends;
} __scadule__;

typedef struct {
    __scadule__ list;
    autolike argument;
} scadule;

typedef scadule scad;


#define DEFINE_RUNC(hex, openmp, pthread, avx2, vector) \
FORSE_ inline void _runc_##hex(autolike target) { \
    REG scad *A = (scad *)target; \
    REG autolike argv = A->argument; \
    REG __scadule__ *RST B = &A->list; \
    OPTB \
    REG pcdx iter = B->workflow; \
    REG pcdx *ends = &B->ends; \
    if (openmp) { \
        #pragma omp parallel for \
        for (int i = 0; iter + i <= *ends; i++) { \
            (*(iter + i))(argv); \
        } \
    } else if (pthread) { \
        pthread_t threads[16]; \
        int i = 0; \
        while (iter <= *ends) { \
            pthread_create(&threads[i], NULL, (void *(*)(void *))(*iter), argv); \
            iter++; \
            i++; \
        } \
        for (int j = 0; j < i; j++) { \
            pthread_join(threads[j], NULL); \
        } \
    } else if (avx2) { \
        __m256i vec_arg = _mm256_set1_epi64x((long long)argv); \
        while (iter <= *ends) { \
            (*iter)((autolike)_mm256_extract_epi64(vec_arg, 0)); \
            iter++; \
        } \
    } else if (vector) { \
        #pragma GCC ivdep \
        for (; LIKELY(iter <= *ends); iter++) { \
            (*iter)(argv); \
        } \
    } else { \
        for (; LIKELY(iter <= *ends); iter++) { \
            (*iter)(argv); \
        } \
    } \
}

/*
 * Instantiate the sixteen runners _runc_0 .. _runc_F. Each row is
 * (suffix, openmp, pthread, avx2, vector); read as bits, the four flags
 * spell the hexadecimal suffix (openmp = 8, pthread = 4, avx2 = 2,
 * vector = 1).
 */
#define PCDF_RUNC_VARIANTS(X) \
    X(0, 0, 0, 0, 0) \
    X(1, 0, 0, 0, 1) \
    X(2, 0, 0, 1, 0) \
    X(3, 0, 0, 1, 1) \
    X(4, 0, 1, 0, 0) \
    X(5, 0, 1, 0, 1) \
    X(6, 0, 1, 1, 0) \
    X(7, 0, 1, 1, 1) \
    X(8, 1, 0, 0, 0) \
    X(9, 1, 0, 0, 1) \
    X(A, 1, 0, 1, 0) \
    X(B, 1, 0, 1, 1) \
    X(C, 1, 1, 0, 0) \
    X(D, 1, 1, 0, 1) \
    X(E, 1, 1, 1, 0) \
    X(F, 1, 1, 1, 1)
PCDF_RUNC_VARIANTS(DEFINE_RUNC)
#undef PCDF_RUNC_VARIANTS

/* Length in bytes of the anonymous mapping that holds one JIT thunk. */
enum { PCDF_JIT_THUNK_BYTES = 64 };

/*
 * Builds a small x86-64 thunk that tail-jumps to `target_func`.
 *
 * The thunk leaves every argument register untouched, so calling the
 * returned pointer with an argument forwards that argument unchanged to
 * `target_func` (System V x86-64 calling convention: first argument in rdi).
 *
 * The code is written while the page is read/write and the page is then
 * switched to read/execute, so it is never writable and executable at the
 * same time.
 *
 * Returns NULL when `target_func` is NULL or when the mapping cannot be
 * created or protected. On success the caller owns the mapping and releases
 * it with munmap(ptr, PCDF_JIT_THUNK_BYTES) once the thunk is no longer
 * needed.
 */
static pcd generate_jit_function(pcd target_func) {
    /* movabs rax, imm64 ; jmp rax */
    static const uint8_t thunk_template[] = {
        0x48, 0xB8,             /* movabs rax, <target_func> */
        0, 0, 0, 0, 0, 0, 0, 0, /* (address placeholder)     */
        0xFF, 0xE0              /* jmp rax                   */
    };
    /* Offset of the address placeholder: right after the 2-byte opcode. */
    enum { ADDRESS_OFFSET = 2 };

    if (target_func == NULL) {
        return NULL;
    }

    void *mapping = mmap(NULL, PCDF_JIT_THUNK_BYTES, PROT_READ | PROT_WRITE,
                         MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    if (mapping == MAP_FAILED) {
        return NULL;
    }

    /* Explicit cast keeps this header usable from C++ as well as C. */
    uint8_t *code = (uint8_t *)mapping;
    memcpy(code, thunk_template, sizeof thunk_template);
    memcpy(code + ADDRESS_OFFSET, &target_func, sizeof target_func);

    if (mprotect(mapping, PCDF_JIT_THUNK_BYTES, PROT_READ | PROT_EXEC) != 0) {
        munmap(mapping, PCDF_JIT_THUNK_BYTES);
        return NULL;
    }
    return (pcd)mapping;
}

#define DEFINE_EXPERIMENTIAL_RUNC(hex, openmp, pthread, avx2, vector) \
FORSE_ inline void _experiemtial_runc_##hex(autolike target) { \
    REG scad *A = (scad *)target; \
    REG autolike argv = A->argument; \
    REG __scadule__ *RST B = &A->list; \
    OPTB \
    REG pcdx iter = B->workflow; \
    REG pcdx *ends = &B->ends; \
    \
    register autolike fixed_argv asm("rdi") = argv; /* Fix argv in rdi register */ \
    \
    if (openmp) { \
        #pragma omp parallel for \
        for (int i = 0; iter + i <= *ends; i++) { \
            pcd func = iter[i]; \
            __asm__ volatile("" ::: "rdi"); /* Tell compiler not to touch rdi */ \
            func(fixed_argv); \
        } \
    } else if (pthread) { \
        pthread_t threads[16]; \
        int i = 0; \
        while (iter <= *ends) { \
            pcd func = generate_jit_function(*iter); /* Use JIT compiled function */ \
            __asm__ volatile("" ::: "rdi"); \
            pthread_create(&threads[i], NULL, (void *(*)(void *))func, fixed_argv); \
            iter++; \
            i++; \
        } \
        for (int j = 0; j < i; j++) { \
            pthread_join(threads[j], NULL); \
        } \
    } else if (avx2) { \
        __m256i vec_arg = _mm256_set1_epi64x((long long)fixed_argv); \
        while (iter <= *ends) { \
            pcd func = generate_jit_function(*iter); /* Use JIT compiled function */ \
            __asm__ volatile("" ::: "rdi"); \
            func((autolike)_mm256_extract_epi64(vec_arg, 0)); \
            iter++; \
        } \
    } else if (vector) { \
        #pragma GCC ivdep \
        for (; LIKELY(iter <= *ends); iter++) { \
            __builtin_prefetch(iter + 4, 0, 1); /* Prefetch function pointer */ \
            pcd func = generate_jit_function(*iter); /* Use JIT compiled function */ \
            __asm__ volatile("" ::: "rdi"); \
            func(fixed_argv); \
        } \
    } else { \
        for (; LIKELY(iter <= *ends); iter++) { \
            __builtin_prefetch(iter + 4, 0, 1); /* Prefetch function pointer */ \
            pcd func = generate_jit_function(*iter); /* Use JIT compiled function */ \
            __asm__ volatile("" ::: "rdi"); \
            func(fixed_argv); \
        } \
    } \
}

/*
 * Instantiate the sixteen experimental runners _experiemtial_runc_0 ..
 * _experiemtial_runc_F. Each row is (suffix, openmp, pthread, avx2,
 * vector); read as bits, the four flags spell the hexadecimal suffix
 * (openmp = 8, pthread = 4, avx2 = 2, vector = 1).
 */
#define PCDF_EXPERIMENTAL_VARIANTS(X) \
    X(0, 0, 0, 0, 0) \
    X(1, 0, 0, 0, 1) \
    X(2, 0, 0, 1, 0) \
    X(3, 0, 0, 1, 1) \
    X(4, 0, 1, 0, 0) \
    X(5, 0, 1, 0, 1) \
    X(6, 0, 1, 1, 0) \
    X(7, 0, 1, 1, 1) \
    X(8, 1, 0, 0, 0) \
    X(9, 1, 0, 0, 1) \
    X(A, 1, 0, 1, 0) \
    X(B, 1, 0, 1, 1) \
    X(C, 1, 1, 0, 0) \
    X(D, 1, 1, 0, 1) \
    X(E, 1, 1, 1, 0) \
    X(F, 1, 1, 1, 1)
PCDF_EXPERIMENTAL_VARIANTS(DEFINE_EXPERIMENTIAL_RUNC)
#undef PCDF_EXPERIMENTAL_VARIANTS

// Scheduler structure for function execution
struct {
    pcd run;
} scaduler = {
    .run = pcdef(target) {_runc_0(target)}
};

#endif