


//Customizable constants
#ifndef LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO
#define LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO 2//determines how accurately size classes are spaced (i.e. when = 0, allocation requests are rounded up to the nearest power of two (2^n), when = 1, rounded to 2^n, or (2^n)*1.5, when = 2, rounded to 2^n, (2^n)*1.25, (2^n)*1.5, or (2^n)*1.75, and so on); this parameter have direct influence on memory fragmentation - bigger values lead to reducing internal fragmentation (which can be approximately estimated as pow(0.5, VALUE)*100%), but at the same time increasing external fragmentation
#endif
#define CHUNK_SIZE     (64*1024)//size of chunk (basic allocation unit for all allocations of size <= MAX_BLOCK_SIZE); must be a power of two (as well as all following parameters), also should not be less than allocation granularity on Windows (which is always 64K by design of NT kernel)
#define CACHE_LINE_SIZE 64
#define MAX_NUM_OF_BLOCKS_IN_BATCH 256//maximum number of blocks to move between a thread cache and a central cache in one shot
static const unsigned int MAX_BATCH_SIZE = 64*1024;//maximum total size of blocks to move between a thread cache and a central cache in one shot (corresponds to half size of thread cache of each size class)
static const unsigned int MAX_BLOCK_SIZE = CHUNK_SIZE;//requesting memory of any size greater than this value will lead to direct call of system virtual memory allocation routine

//Platform-specific stuff

#ifdef __cplusplus
#define CPPCODE(code) code
#include <new>
#else
#define CPPCODE(code)
#endif



#define __STDC_LIMIT_MACROS
#include <stdint.h> //for SIZE_MAX
#include <limits.h> //for UINT_MAX
#define alignas(a) __attribute__((aligned(a)))

#define NOINLINE __attribute__((noinline))
#define CAS_LOCK(lock) __sync_lock_test_and_set(lock, 1)
#define SPINLOCK_RELEASE(lock) __sync_lock_release(lock)
#define PAUSE __asm__ __volatile__("pause" ::: "memory")
#define BSR(r, v) r = CODE3264(__builtin_clz(v) ^ 31, __builtin_clzll(v) ^ 63)

//x ^ 31 = 31 - x, but gcc does not optimize 31 - __builtin_clz(x) to bsr(x), but generates 31 - (bsr(x) ^ 31)




//#define likely(x) __builtin_expect(!!(x), 1)
//#define unlikely(x) __builtin_expect(!!(x), 0)

//#define likely(x) __builtin_expect(static_cast<bool>(x), 1)
//#define unlikely(x) __builtin_expect(static_cast<bool>(x), 0)


static void SPINLOCK_ACQUIRE (volatile int *lock) {
    if (CAS_LOCK(lock))
        while (*lock || CAS_LOCK(lock))
            PAUSE;
}

//#include <assert.h>
//#include <string.h> //for memset

#if SIZE_MAX == UINT_MAX
#define CODE3264(c32, c64) c32
#else
#define CODE3264(c32, c64) c64
#endif
typedef char CODE3264_check[sizeof(void*) == CODE3264(4, 8) ? 1 : -1];


#define WIN32_LEAN_AND_MEAN
#include <windows.h>




static NOINLINE void sys_free(void *p)
{
	if (p == NULL) return;
	VirtualFree(p, 0, MEM_RELEASE);
}



//End of platform-specific stuff

#define MAX_BLOCK_SIZE (MAX_BLOCK_SIZE < CHUNK_SIZE - (CHUNK_SIZE >> (1 + LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) ? \
                        MAX_BLOCK_SIZE : CHUNK_SIZE - (CHUNK_SIZE >> (1 + LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)))
#define NUMBER_OF_SIZE_CLASSES ((sizeof(void*)*8 + 1) << LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)

typedef struct FreeBlock
{
	struct FreeBlock *next,
                     *nextBatch;//in the central cache blocks are organized into batches to allow fast moving blocks from thread cache and back
} FreeBlock;

typedef struct alignas(CACHE_LINE_SIZE) ChunkBase//force sizeof(Chunk) = cache line size to avoid false sharing
{
	unsigned int sizeClass;
} Chunk;

typedef struct alignas(CACHE_LINE_SIZE) ChunkSm//chunk of smallest blocks of size = sizeof(void*)
{
	unsigned int sizeClass;//struct ChunkBase chunk;
	struct ChunkSm *prev, *next;
	int numBatches;
#define NUM_OF_BATCHES_IN_CHUNK_SM CHUNK_SIZE/(sizeof(void*)*MAX_NUM_OF_BLOCKS_IN_BATCH)
	FreeBlock *batches[NUM_OF_BATCHES_IN_CHUNK_SM];//batches of blocks inside ChunkSm have to be stored separately (as smallest blocks of size = sizeof(void*) do not have enough space to store second pointer for the batch)
} ChunkSm;

typedef struct alignas(CACHE_LINE_SIZE)//align needed to prevent cache line sharing between adjacent classes accessed from different threads
{
	volatile int lock;
	unsigned int freeBlocksInLastChunk;
	char *lastChunk;//Chunk or ChunkSm
	union {
		FreeBlock *firstBatch;
		ChunkSm *chunkWithFreeBatches;
	};
	FreeBlock *freeList;//short list of free blocks that for some reason are not organized into batches
	unsigned int freeListSize;//should be less than batch size
	uintptr_t minChunkAddr, maxChunkAddr;
} CentralCache;
static CentralCache centralCache[NUMBER_OF_SIZE_CLASSES];// = {{0}};

typedef struct
{
	FreeBlock *freeList;
	FreeBlock *tempList;//intermediate list providing a hysteresis in order to avoid a corner case of too frequent moving free blocks to the central cache and back from
	int counter;//number of blocks in freeList (used to determine when to move free blocks list to the central cache)
} ThreadCache;

//static thread_local ThreadCache threadCache[NUMBER_OF_SIZE_CLASSES];// = {{0}};
static ThreadCache threadCache[NUMBER_OF_SIZE_CLASSES];// = {{0}};


static struct
{
	volatile int lock;
	void *freeChunk;
	size_t size;
} pad = {0, NULL, 0};

static CPPCODE(inline) unsigned int get_size_class(size_t size)
{
	unsigned int index;
#if _MSC_VER && LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 2

    //have to use a const array here, because MS compiler unfortunately does not evaluate _BitScanReverse with a constant argument at compile time (as gcc does for __builtin_clz)

	static const unsigned char small_size_classes[256] = {
		131, 4, 15, 17, 19, 20, 21, 22, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43
	};
	if (size < 256 * sizeof(void*) - (sizeof(void*)-1))
		return small_size_classes[(size + (sizeof(void*)-1)) / sizeof(void*)];
#endif

	size = (size + (sizeof(void*)-1)) & ~(sizeof(void*)-1);//minimum block size is sizeof(void*), doing this is better than just "size = max(size, sizeof(void*))"

	BSR(index, (size-1)|1);//"|1" needed because result of BSR is undefined for zero input
#if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 0
	return index;
#else
	return (index<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO) + (unsigned int)((size-1) >> (index-LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO));
#endif
}

static unsigned int class_to_size(unsigned int c)
{
#if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 0
	return 2 << c;
#else
#if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO >= CODE3264(2, 3)
	if (c < (LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO))//for block sizes less than or equal to pow(2, LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)
		return 2 << (c>>LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO);
	else
#endif
	{
		c -= (1<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)-1;
		return ((c & ((1<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)-1)) | (1<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) << ((c>>LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)-LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO);
	}
#endif
}


//batch_size calculates a number of blocks to move between a thread cache and a central cache in one shot
static unsigned int batch_size(unsigned int sizeClass)
{
	return ((MAX_BATCH_SIZE-1) >> (sizeClass >> LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) & (MAX_NUM_OF_BLOCKS_IN_BATCH-1);
}

CPPCODE(template <bool> static) void *ltmalloc(size_t size);

static void *fetch_from_central_cache(size_t size, ThreadCache *tc, unsigned int sizeClass)
{
	void *p;
	if (size-1u <= MAX_BLOCK_SIZE-1u)//<=> if (size <= MAX_BLOCK_SIZE && size != 0)
	{
		FreeBlock *fb = tc->tempList;
		if (fb)
		{
			//assert(tc->counter == (int)batch_size(sizeClass)+1);
			tc->counter = 1;
			tc->freeList = fb->next;
			tc->tempList = NULL;
			return fb;
		}

		//assert(tc->counter == 0 || tc->counter == (int)batch_size(sizeClass)+1);
		tc->counter = 1;

		{CentralCache *cc = &centralCache[sizeClass];
		SPINLOCK_ACQUIRE(&cc->lock);
		if (!cc->firstBatch)//no free batch
		{no_free_batch:{
			unsigned int batchSize = batch_size(sizeClass)+1;

			if (cc->freeList)
			{
				//assert(cc->freeListSize);
				if (cc->freeListSize <= batchSize + 1)
				{
					tc->counter = batchSize - cc->freeListSize + 1;
				//	batchSize = cc->freeListSize;
					cc->freeListSize = 0;
					fb = cc->freeList;
					cc->freeList = NULL;
				}
				else
				{
					cc->freeListSize -= batchSize;
					fb = cc->freeList;
					{FreeBlock *b = cc->freeList;
					while (--batchSize) b = b->next;
					cc->freeList = b->next;
					b->next = NULL;}
				}
				SPINLOCK_RELEASE(&cc->lock);
				tc->freeList = fb->next;
				//init_pthread_destructor();//this call must be placed carefully to allow recursive memory allocation from pthread_key_create (in case when ltalloc replaces the system malloc)
				return fb;
			}

			{unsigned int blockSize = class_to_size(sizeClass);
			if (cc->freeBlocksInLastChunk)
			{
				char *firstFree = cc->lastChunk;
				//assert(cc->lastChunk && cc->freeBlocksInLastChunk == (CHUNK_SIZE - ((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)))/blockSize);
				if (cc->freeBlocksInLastChunk < batchSize) {
					tc->counter = batchSize - cc->freeBlocksInLastChunk + 1;
					batchSize = cc->freeBlocksInLastChunk;
				}
				cc->freeBlocksInLastChunk -= batchSize;
				cc->lastChunk += blockSize * batchSize;
				if (cc->freeBlocksInLastChunk == 0) {
					//assert(((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)) == 0);
					cc->lastChunk = ((char**)cc->lastChunk)[-1];
					if (cc->lastChunk)
						cc->freeBlocksInLastChunk = (CHUNK_SIZE - ((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)))/blockSize;
				}
				SPINLOCK_RELEASE(&cc->lock);
				fb = (FreeBlock*)firstFree;
				while (--batchSize)
					firstFree = (char*)(((FreeBlock*)firstFree)->next = (FreeBlock*)(firstFree + blockSize));
				((FreeBlock*)firstFree)->next = NULL;
				tc->freeList = fb->next;
				//init_pthread_destructor();
				return fb;
			}

			//Allocate new chunk
			SPINLOCK_RELEASE(&cc->lock);//release lock for a while

			SPINLOCK_ACQUIRE(&pad.lock);
			if (pad.freeChunk)
			{
				p = pad.freeChunk;
				pad.freeChunk = *(void**)p;
				pad.size -= CHUNK_SIZE;
				SPINLOCK_RELEASE(&pad.lock);
				((char**)((char*)p + CHUNK_SIZE))[-1] = 0;
			} else {
				SPINLOCK_RELEASE(&pad.lock);
                p = VirtualAlloc(NULL, CHUNK_SIZE, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
				if (!p) { return NULL; }
			}

#define CHUNK_IS_SMALL (sizeClass < get_size_class(2*sizeof(void*)))
			{unsigned int numBlocksInChunk = (CHUNK_SIZE - (CHUNK_IS_SMALL ? sizeof(ChunkSm) : sizeof(Chunk)))/blockSize;
#ifndef _WIN32
			//deleted
#endif
			//assert(((char**)((char*)p + CHUNK_SIZE))[-1] == 0);//assume that allocated memory is always zero filled (on first access); it is better not to zero it explicitly because it will lead to allocation of physical page which may never needed otherwise
			if (numBlocksInChunk < batchSize) {
				tc->counter = batchSize - numBlocksInChunk + 1;
				batchSize = numBlocksInChunk;
			}

			//Prepare chunk
			((Chunk*)p)->sizeClass = sizeClass;
			{char *firstFree = (char*)p + CHUNK_SIZE - numBlocksInChunk*blockSize;//blocks in chunk are located in such way to achieve a maximum possible alignment
			fb = (FreeBlock*)firstFree;
			{int n = batchSize; while (--n)
				firstFree = (char*)(((FreeBlock*)firstFree)->next = (FreeBlock*)(firstFree + blockSize));}
			((FreeBlock*)firstFree)->next = NULL;
			firstFree += blockSize;

			SPINLOCK_ACQUIRE(&cc->lock);
			if ((uintptr_t)p < cc->minChunkAddr || !cc->minChunkAddr) cc->minChunkAddr = (uintptr_t)p;
			if ((uintptr_t)p > cc->maxChunkAddr                     ) cc->maxChunkAddr = (uintptr_t)p;

			if (CHUNK_IS_SMALL)//special handling for smallest blocks of size = sizeof(void*)
			{
				ChunkSm *cs = (ChunkSm*)p;
				cs->numBatches = 0;
				//Insert new chunk right after chunkWithFreeBatches
				cs->prev = cc->chunkWithFreeBatches;
				if (cc->chunkWithFreeBatches) {
					cs->next = cc->chunkWithFreeBatches->next;
					if (cc->chunkWithFreeBatches->next) cc->chunkWithFreeBatches->next->prev = cs;
					cc->chunkWithFreeBatches->next = cs;
				} else {
					cs->next = NULL;
					cc->chunkWithFreeBatches = cs;
				}
			}

			if (cc->freeBlocksInLastChunk)//so happened that other thread have already allocated chunk for the same size class while the lock was released
			{
				//Hook pointer to the current lastChunk at the end of new chunk (another way is just put all blocks to cc->freeList which is much less effecient)
				((char**)(((uintptr_t)firstFree & ~(CHUNK_SIZE-1)) + CHUNK_SIZE))[-1] = cc->lastChunk;
			}
			cc->freeBlocksInLastChunk = numBlocksInChunk - batchSize;
			cc->lastChunk = firstFree;
		}}}}}
		else {
			if (!CHUNK_IS_SMALL)//smallest blocks of size = sizeof(void*) are handled specially
			{
				fb = cc->firstBatch;
				cc->firstBatch = fb->nextBatch;
			}
			else//size of block = sizeof(void*)
			{
				ChunkSm *cs = cc->chunkWithFreeBatches;
				if (cs->numBatches == 0)
				{
					if (cs->prev == NULL) goto no_free_batch;
					cs = cc->chunkWithFreeBatches = cs->prev;
					//assert(cs->numBatches == NUM_OF_BATCHES_IN_CHUNK_SM);
				}
				fb = cs->batches[--cs->numBatches];
			}
		}
		SPINLOCK_RELEASE(&cc->lock);}
		tc->freeList = fb->next;
		//init_pthread_destructor();
		return fb;
	}
	else//allocate block directly from the system
	{
		size = (size + CHUNK_SIZE-1) & ~(CHUNK_SIZE-1);
        p = VirtualAlloc(NULL, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
		return p;
	}
}

void *ltmalloc(size_t size)
{
	unsigned int sizeClass = get_size_class(size);
	ThreadCache *tc = &threadCache[sizeClass];
	FreeBlock *fb = tc->freeList;
	if (fb)
	{
		tc->freeList = fb->next;
		tc->counter++;
		return fb;
	}
	else
		return fetch_from_central_cache (size, tc, sizeClass);
}



void ltfree(void *p)
{
	if ((uintptr_t)p & (CHUNK_SIZE-1))
	{
		unsigned int sizeClass = ((Chunk*)((uintptr_t)p & ~(CHUNK_SIZE-1)))->sizeClass;
		ThreadCache *tc = &threadCache[sizeClass];

		// if (unlikely(--tc->counter < 0))
		// 	move_to_central_cache(tc, sizeClass);

		((FreeBlock*)p)->next = tc->freeList;
		tc->freeList = (FreeBlock*)p;
	}
	else
		sys_free(p);
}

size_t ltmsize(void *p)
{
	if ((uintptr_t)p & (CHUNK_SIZE-1))
	{
		return class_to_size(((Chunk*)((uintptr_t)p & ~(CHUNK_SIZE-1)))->sizeClass);
	}
	else
	{
		if (p == NULL) return 0;
		{MEMORY_BASIC_INFORMATION mi;
		VirtualQuery(p, &mi, sizeof(mi));
		return mi.RegionSize;}
	}
}





void* operator new[](size_t size, const char* /*name*/, int /*flags*/,
					 unsigned /*debugFlags*/, const char* /*file*/, int /*line*/)
{
	return ltmalloc(size);
}

void* operator new[](size_t size, size_t alignment, size_t alignmentOffset, const char* /*name*/,
					 int flags, unsigned /*debugFlags*/, const char* /*file*/, int /*line*/)
{
	// Substitute your aligned malloc.
	//return malloc_aligned(size, alignment, alignmentOffset);
	return ltmalloc(size);
}


void* operator new(size_t size)
{
	return ltmalloc(size);
}

void* operator new[](size_t size)
{
	return ltmalloc(size);
}


void operator delete(void* p)
{
	if(p) // The standard specifies that 'delete NULL' is a valid operation.
		ltfree(p);
}
void operator delete[](void* p)
{
	if(p)
		ltfree(p);
}

/* } */
