//Customizable constants #ifndef LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO #define LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO 2//determines how accurately size classes are spaced (i.e. when = 0, allocation requests are rounded up to the nearest power of two (2^n), when = 1, rounded to 2^n, or (2^n)*1.5, when = 2, rounded to 2^n, (2^n)*1.25, (2^n)*1.5, or (2^n)*1.75, and so on); this parameter have direct influence on memory fragmentation - bigger values lead to reducing internal fragmentation (which can be approximately estimated as pow(0.5, VALUE)*100%), but at the same time increasing external fragmentation #endif #define CHUNK_SIZE (64*1024)//size of chunk (basic allocation unit for all allocations of size <= MAX_BLOCK_SIZE); must be a power of two (as well as all following parameters), also should not be less than allocation granularity on Windows (which is always 64K by design of NT kernel) #define CACHE_LINE_SIZE 64 #define MAX_NUM_OF_BLOCKS_IN_BATCH 256//maximum number of blocks to move between a thread cache and a central cache in one shot static const unsigned int MAX_BATCH_SIZE = 64*1024;//maximum total size of blocks to move between a thread cache and a central cache in one shot (corresponds to half size of thread cache of each size class) static const unsigned int MAX_BLOCK_SIZE = CHUNK_SIZE;//requesting memory of any size greater than this value will lead to direct call of system virtual memory allocation routine //Platform-specific stuff #ifdef __cplusplus #define CPPCODE(code) code #include #else #define CPPCODE(code) #endif #define __STDC_LIMIT_MACROS #include //for SIZE_MAX #include //for UINT_MAX #define alignas(a) __attribute__((aligned(a))) #define NOINLINE __attribute__((noinline)) #define CAS_LOCK(lock) __sync_lock_test_and_set(lock, 1) #define SPINLOCK_RELEASE(lock) __sync_lock_release(lock) #define PAUSE __asm__ __volatile__("pause" ::: "memory") #define BSR(r, v) r = CODE3264(__builtin_clz(v) ^ 31, __builtin_clzll(v) ^ 63) //x ^ 31 = 31 - x, but gcc does not optimize 31 - __builtin_clz(x) to bsr(x), but generates 31 - (bsr(x) ^ 31) //#define likely(x) __builtin_expect(!!(x), 1) //#define unlikely(x) __builtin_expect(!!(x), 0) //#define likely(x) __builtin_expect(static_cast(x), 1) //#define unlikely(x) __builtin_expect(static_cast(x), 0) static void SPINLOCK_ACQUIRE (volatile int *lock) { if (CAS_LOCK(lock)) while (*lock || CAS_LOCK(lock)) PAUSE; } //#include //#include //for memset #if SIZE_MAX == UINT_MAX #define CODE3264(c32, c64) c32 #else #define CODE3264(c32, c64) c64 #endif typedef char CODE3264_check[sizeof(void*) == CODE3264(4, 8) ? 1 : -1]; #define WIN32_LEAN_AND_MEAN #include static NOINLINE void sys_free(void *p) { if (p == NULL) return; VirtualFree(p, 0, MEM_RELEASE); } //End of platform-specific stuff #define MAX_BLOCK_SIZE (MAX_BLOCK_SIZE < CHUNK_SIZE - (CHUNK_SIZE >> (1 + LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) ? \ MAX_BLOCK_SIZE : CHUNK_SIZE - (CHUNK_SIZE >> (1 + LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO))) #define NUMBER_OF_SIZE_CLASSES ((sizeof(void*)*8 + 1) << LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO) typedef struct FreeBlock { struct FreeBlock *next, *nextBatch;//in the central cache blocks are organized into batches to allow fast moving blocks from thread cache and back } FreeBlock; typedef struct alignas(CACHE_LINE_SIZE) ChunkBase//force sizeof(Chunk) = cache line size to avoid false sharing { unsigned int sizeClass; } Chunk; typedef struct alignas(CACHE_LINE_SIZE) ChunkSm//chunk of smallest blocks of size = sizeof(void*) { unsigned int sizeClass;//struct ChunkBase chunk; struct ChunkSm *prev, *next; int numBatches; #define NUM_OF_BATCHES_IN_CHUNK_SM CHUNK_SIZE/(sizeof(void*)*MAX_NUM_OF_BLOCKS_IN_BATCH) FreeBlock *batches[NUM_OF_BATCHES_IN_CHUNK_SM];//batches of blocks inside ChunkSm have to be stored separately (as smallest blocks of size = sizeof(void*) do not have enough space to store second pointer for the batch) } ChunkSm; typedef struct alignas(CACHE_LINE_SIZE)//align needed to prevent cache line sharing between adjacent classes accessed from different threads { volatile int lock; unsigned int freeBlocksInLastChunk; char *lastChunk;//Chunk or ChunkSm union { FreeBlock *firstBatch; ChunkSm *chunkWithFreeBatches; }; FreeBlock *freeList;//short list of free blocks that for some reason are not organized into batches unsigned int freeListSize;//should be less than batch size uintptr_t minChunkAddr, maxChunkAddr; } CentralCache; static CentralCache centralCache[NUMBER_OF_SIZE_CLASSES];// = {{0}}; typedef struct { FreeBlock *freeList; FreeBlock *tempList;//intermediate list providing a hysteresis in order to avoid a corner case of too frequent moving free blocks to the central cache and back from int counter;//number of blocks in freeList (used to determine when to move free blocks list to the central cache) } ThreadCache; //static thread_local ThreadCache threadCache[NUMBER_OF_SIZE_CLASSES];// = {{0}}; static ThreadCache threadCache[NUMBER_OF_SIZE_CLASSES];// = {{0}}; static struct { volatile int lock; void *freeChunk; size_t size; } pad = {0, NULL, 0}; static CPPCODE(inline) unsigned int get_size_class(size_t size) { unsigned int index; #if _MSC_VER && LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 2 //have to use a const array here, because MS compiler unfortunately does not evaluate _BitScanReverse with a constant argument at compile time (as gcc does for __builtin_clz) static const unsigned char small_size_classes[256] = { 131, 4, 15, 17, 19, 20, 21, 22, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43 }; if (size < 256 * sizeof(void*) - (sizeof(void*)-1)) return small_size_classes[(size + (sizeof(void*)-1)) / sizeof(void*)]; #endif size = (size + (sizeof(void*)-1)) & ~(sizeof(void*)-1);//minimum block size is sizeof(void*), doing this is better than just "size = max(size, sizeof(void*))" BSR(index, (size-1)|1);//"|1" needed because result of BSR is undefined for zero input #if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 0 return index; #else return (index<> (index-LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)); #endif } static unsigned int class_to_size(unsigned int c) { #if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 0 return 2 << c; #else #if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO >= CODE3264(2, 3) if (c < (LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO<>LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO); else #endif { c -= (1<>LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)-LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO); } #endif } //batch_size calculates a number of blocks to move between a thread cache and a central cache in one shot static unsigned int batch_size(unsigned int sizeClass) { return ((MAX_BATCH_SIZE-1) >> (sizeClass >> LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) & (MAX_NUM_OF_BLOCKS_IN_BATCH-1); } CPPCODE(template static) void *ltmalloc(size_t size); static void *fetch_from_central_cache(size_t size, ThreadCache *tc, unsigned int sizeClass) { void *p; if (size-1u <= MAX_BLOCK_SIZE-1u)//<=> if (size <= MAX_BLOCK_SIZE && size != 0) { FreeBlock *fb = tc->tempList; if (fb) { //assert(tc->counter == (int)batch_size(sizeClass)+1); tc->counter = 1; tc->freeList = fb->next; tc->tempList = NULL; return fb; } //assert(tc->counter == 0 || tc->counter == (int)batch_size(sizeClass)+1); tc->counter = 1; {CentralCache *cc = ¢ralCache[sizeClass]; SPINLOCK_ACQUIRE(&cc->lock); if (!cc->firstBatch)//no free batch {no_free_batch:{ unsigned int batchSize = batch_size(sizeClass)+1; if (cc->freeList) { //assert(cc->freeListSize); if (cc->freeListSize <= batchSize + 1) { tc->counter = batchSize - cc->freeListSize + 1; // batchSize = cc->freeListSize; cc->freeListSize = 0; fb = cc->freeList; cc->freeList = NULL; } else { cc->freeListSize -= batchSize; fb = cc->freeList; {FreeBlock *b = cc->freeList; while (--batchSize) b = b->next; cc->freeList = b->next; b->next = NULL;} } SPINLOCK_RELEASE(&cc->lock); tc->freeList = fb->next; //init_pthread_destructor();//this call must be placed carefully to allow recursive memory allocation from pthread_key_create (in case when ltalloc replaces the system malloc) return fb; } {unsigned int blockSize = class_to_size(sizeClass); if (cc->freeBlocksInLastChunk) { char *firstFree = cc->lastChunk; //assert(cc->lastChunk && cc->freeBlocksInLastChunk == (CHUNK_SIZE - ((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)))/blockSize); if (cc->freeBlocksInLastChunk < batchSize) { tc->counter = batchSize - cc->freeBlocksInLastChunk + 1; batchSize = cc->freeBlocksInLastChunk; } cc->freeBlocksInLastChunk -= batchSize; cc->lastChunk += blockSize * batchSize; if (cc->freeBlocksInLastChunk == 0) { //assert(((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)) == 0); cc->lastChunk = ((char**)cc->lastChunk)[-1]; if (cc->lastChunk) cc->freeBlocksInLastChunk = (CHUNK_SIZE - ((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)))/blockSize; } SPINLOCK_RELEASE(&cc->lock); fb = (FreeBlock*)firstFree; while (--batchSize) firstFree = (char*)(((FreeBlock*)firstFree)->next = (FreeBlock*)(firstFree + blockSize)); ((FreeBlock*)firstFree)->next = NULL; tc->freeList = fb->next; //init_pthread_destructor(); return fb; } //Allocate new chunk SPINLOCK_RELEASE(&cc->lock);//release lock for a while SPINLOCK_ACQUIRE(&pad.lock); if (pad.freeChunk) { p = pad.freeChunk; pad.freeChunk = *(void**)p; pad.size -= CHUNK_SIZE; SPINLOCK_RELEASE(&pad.lock); ((char**)((char*)p + CHUNK_SIZE))[-1] = 0; } else { SPINLOCK_RELEASE(&pad.lock); p = VirtualAlloc(NULL, CHUNK_SIZE, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); if (!p) { return NULL; } } #define CHUNK_IS_SMALL (sizeClass < get_size_class(2*sizeof(void*))) {unsigned int numBlocksInChunk = (CHUNK_SIZE - (CHUNK_IS_SMALL ? sizeof(ChunkSm) : sizeof(Chunk)))/blockSize; #ifndef _WIN32 //deleted #endif //assert(((char**)((char*)p + CHUNK_SIZE))[-1] == 0);//assume that allocated memory is always zero filled (on first access); it is better not to zero it explicitly because it will lead to allocation of physical page which may never needed otherwise if (numBlocksInChunk < batchSize) { tc->counter = batchSize - numBlocksInChunk + 1; batchSize = numBlocksInChunk; } //Prepare chunk ((Chunk*)p)->sizeClass = sizeClass; {char *firstFree = (char*)p + CHUNK_SIZE - numBlocksInChunk*blockSize;//blocks in chunk are located in such way to achieve a maximum possible alignment fb = (FreeBlock*)firstFree; {int n = batchSize; while (--n) firstFree = (char*)(((FreeBlock*)firstFree)->next = (FreeBlock*)(firstFree + blockSize));} ((FreeBlock*)firstFree)->next = NULL; firstFree += blockSize; SPINLOCK_ACQUIRE(&cc->lock); if ((uintptr_t)p < cc->minChunkAddr || !cc->minChunkAddr) cc->minChunkAddr = (uintptr_t)p; if ((uintptr_t)p > cc->maxChunkAddr ) cc->maxChunkAddr = (uintptr_t)p; if (CHUNK_IS_SMALL)//special handling for smallest blocks of size = sizeof(void*) { ChunkSm *cs = (ChunkSm*)p; cs->numBatches = 0; //Insert new chunk right after chunkWithFreeBatches cs->prev = cc->chunkWithFreeBatches; if (cc->chunkWithFreeBatches) { cs->next = cc->chunkWithFreeBatches->next; if (cc->chunkWithFreeBatches->next) cc->chunkWithFreeBatches->next->prev = cs; cc->chunkWithFreeBatches->next = cs; } else { cs->next = NULL; cc->chunkWithFreeBatches = cs; } } if (cc->freeBlocksInLastChunk)//so happened that other thread have already allocated chunk for the same size class while the lock was released { //Hook pointer to the current lastChunk at the end of new chunk (another way is just put all blocks to cc->freeList which is much less effecient) ((char**)(((uintptr_t)firstFree & ~(CHUNK_SIZE-1)) + CHUNK_SIZE))[-1] = cc->lastChunk; } cc->freeBlocksInLastChunk = numBlocksInChunk - batchSize; cc->lastChunk = firstFree; }}}}} else { if (!CHUNK_IS_SMALL)//smallest blocks of size = sizeof(void*) are handled specially { fb = cc->firstBatch; cc->firstBatch = fb->nextBatch; } else//size of block = sizeof(void*) { ChunkSm *cs = cc->chunkWithFreeBatches; if (cs->numBatches == 0) { if (cs->prev == NULL) goto no_free_batch; cs = cc->chunkWithFreeBatches = cs->prev; //assert(cs->numBatches == NUM_OF_BATCHES_IN_CHUNK_SM); } fb = cs->batches[--cs->numBatches]; } } SPINLOCK_RELEASE(&cc->lock);} tc->freeList = fb->next; //init_pthread_destructor(); return fb; } else//allocate block directly from the system { size = (size + CHUNK_SIZE-1) & ~(CHUNK_SIZE-1); p = VirtualAlloc(NULL, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); return p; } } void *ltmalloc(size_t size) { unsigned int sizeClass = get_size_class(size); ThreadCache *tc = &threadCache[sizeClass]; FreeBlock *fb = tc->freeList; if (fb) { tc->freeList = fb->next; tc->counter++; return fb; } else return fetch_from_central_cache (size, tc, sizeClass); } void ltfree(void *p) { if ((uintptr_t)p & (CHUNK_SIZE-1)) { unsigned int sizeClass = ((Chunk*)((uintptr_t)p & ~(CHUNK_SIZE-1)))->sizeClass; ThreadCache *tc = &threadCache[sizeClass]; // if (unlikely(--tc->counter < 0)) // move_to_central_cache(tc, sizeClass); ((FreeBlock*)p)->next = tc->freeList; tc->freeList = (FreeBlock*)p; } else sys_free(p); } size_t ltmsize(void *p) { if ((uintptr_t)p & (CHUNK_SIZE-1)) { return class_to_size(((Chunk*)((uintptr_t)p & ~(CHUNK_SIZE-1)))->sizeClass); } else { if (p == NULL) return 0; {MEMORY_BASIC_INFORMATION mi; VirtualQuery(p, &mi, sizeof(mi)); return mi.RegionSize;} } } void* operator new[](size_t size, const char* /*name*/, int /*flags*/, unsigned /*debugFlags*/, const char* /*file*/, int /*line*/) { return ltmalloc(size); } void* operator new[](size_t size, size_t alignment, size_t alignmentOffset, const char* /*name*/, int flags, unsigned /*debugFlags*/, const char* /*file*/, int /*line*/) { // Substitute your aligned malloc. //return malloc_aligned(size, alignment, alignmentOffset); return ltmalloc(size); } void* operator new(size_t size) { return ltmalloc(size); } void* operator new[](size_t size) { return ltmalloc(size); } void operator delete(void* p) { if(p) // The standard specifies that 'delete NULL' is a valid operation. ltfree(p); } void operator delete[](void* p) { if(p) ltfree(p); } /* } */