/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * Copyright (C) 2011 Norbert Thiebaud * License: GPLv3 */ /* define to activate stats reporting on hash usage*/ /* #define HASH_STAT */ /* =============================================== * Set-up: defines to identify the system and system related properties * =============================================== */ #ifdef __APPLE__ #ifdef __x86_64__ #define CORE_BIG_ENDIAN 0 #define CORE_LITTLE_ENDIAN 1 #define USE_MEMORY_ALIGNMENT 64 /* big value -> no alignment */ #else #define CORE_BIG_ENDIAN 1 #define CORE_LITTLE_ENDIAN 0 #define USE_MEMORY_ALIGNMENT 4 #endif #endif #ifdef _AIX #define CORE_BIG_ENDIAN 1 #define CORE_LITTLE_ENDIAN 0 #define USE_MEMORY_ALIGNMENT 4 #endif /* Def _AIX */ #ifdef __CYGWIN__ #define __windows #define CORE_BIG_ENDIAN 0 #define CORE_LITTLE_ENDIAN 1 #define USE_MEMORY_ALIGNMENT 64 /* big value -> no alignment */ #endif /* Def __CYGWIN__ */ #if defined(__linux) || defined(__OpenBSD__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || \ defined(__DragonFly__) #if __BYTE_ORDER == __LITTLE_ENDIAN #define CORE_BIG_ENDIAN 0 #define CORE_LITTLE_ENDIAN 1 #if defined(__x86_64) || defined(__i386) #define USE_MEMORY_ALIGNMENT 64 #else #define USE_MEMORY_ALIGNMENT 4 #endif #else /* !(__BYTE_ORDER == __LITTLE_ENDIAN) */ #if __BYTE_ORDER == __BIG_ENDIAN #define CORE_BIG_ENDIAN 1 #define CORE_LITTLE_ENDIAN 0 #define USE_MEMORY_ALIGNMENT 4 #endif /* __BYTE_ORDER == __BIG_ENDIAN */ #endif /* !(__BYTE_ORDER == __LITTLE_ENDIAN) */ #endif /* Def __linux || Def *BSD */ #ifdef __sun #ifdef __sparc #define CORE_BIG_ENDIAN 1 #define CORE_LITTLE_ENDIAN 0 #define USE_MEMORY_ALIGNMENT 4 #else /* Ndef __sparc */ #define CORE_BIG_ENDIAN 0 #define CORE_LITTLE_ENDIAN 1 #define USE_MEMORY_ALIGNMENT 4 #endif /* Ndef __sparc */ #endif /* Def __sun */ /* Note USE_MEMORY_ALIGNMENT is 4 for platform that allow short non-aligned but required int access to be aligned (e.g sparc, ppc, zos..) * USE_MEMORY_ALIGNMENT is 2 for platform that require short and int access to be aligned (e.g hppa ) * if the platform does not have alignment requirement (x86/amd64) use a big value (i.e > 16) */ #ifndef USE_MEMORY_ALIGNMENT #error "USE_MEMORY_ALIGNMENT must be defined to the proper alignment value for the platform" #endif #include #include #include #include #include #include #include #include #include #ifdef __windows #include #else #include #endif /* modes */ #ifdef __windows #define FILE_O_RDONLY _O_RDONLY #define FILE_O_BINARY _O_BINARY #else /* not windaube */ #define FILE_O_RDONLY O_RDONLY #define FILE_O_BINARY 0 #endif /* not windaube */ #ifndef TRUE #define TRUE 1 #endif #ifndef FALSE #define FALSE 0 #endif static char* base_dir_var = "$(SRCDIR)"; #define kBASE_DIR_VAR_LENGTH 9 static char* base_dir; #ifdef __GNUC__ #define clz __builtin_clz #else static inline int clz(unsigned int value) { int result = 32; while(value) { value >>= 1; result -= 1; } return result; } #endif #if (USE_MEMORY_ALIGNMENT > 4) #define get_unaligned_uint(str) (*(unsigned int*)(str)) #else static inline unsigned int get_unaligned_uint(const unsigned char* cursor) { unsigned int result; memcpy(&result, cursor, sizeof(unsigned int)); return result; } #endif /* =============================================== * memory pool for fast fix-size allocation (non-tread-safe) * =============================================== */ struct pool { void* head_free; /**< head of a linked list of freed element */ char* fresh; /**< top of a memory block to dig new element */ char* tail; /**< to detect end of extent... when fresh pass tail */ void* extent; /**< pointer to the primary extent block */ int size_elem; /**< size of an element. */ int primary; /**< primary allocation in bytes */ int secondary; /**< secondary allocation in bytes */ }; #define POOL_ALIGN_INCREMENT 8 /**< Alignement, must be a power of 2 and of size > to sizeof(void*) */ static void* pool_take_extent(struct pool* pool, int allocate) { unsigned int size = 0; void* extent; void* data = NULL; if(pool->extent) { fputs("taking a pool extent\n", stderr); /* we already have an extent, so this is a secondary */ if(pool->secondary) { size = pool->secondary; } } else { assert(pool->primary); size = pool->primary; } if(size) { extent = malloc(size); if(extent) { *(void**)extent = pool->extent; pool->extent = extent; if(allocate) { data = ((char*)extent) + POOL_ALIGN_INCREMENT; pool->fresh = ((char*)data) + pool->size_elem; pool->tail = pool->fresh + (size - pool->size_elem); } else { pool->fresh = ((char*)extent) + POOL_ALIGN_INCREMENT; pool->tail = pool->fresh + (size - pool->size_elem); } } } return data; } /* Create a memory pool for fix size objects * this is a simplified implementation that * is _not_ thread safe. */ struct pool* pool_create(int size_elem, int flags, int primary, int secondary) { struct pool* pool; assert(primary > 0); assert(secondary >= 0); assert(size_elem > 0); pool = (struct pool*)calloc(1, sizeof(struct pool)); if(!pool) return NULL; /* Adjust the element size so that it be aligned, and so that an element could * at least contain a void* */ pool->size_elem = size_elem = (size_elem + POOL_ALIGN_INCREMENT - 1) & ~(POOL_ALIGN_INCREMENT - 1); pool->primary = (size_elem * primary) + POOL_ALIGN_INCREMENT; pool->secondary = secondary > 0 ? (size_elem * secondary) + POOL_ALIGN_INCREMENT : 0; pool_take_extent(pool, FALSE); return pool; } void pool_destroy(struct pool* pool) { void* extent; void* next; if(pool != NULL) { extent = pool->extent; while(extent) { next = *(void**)extent; free(extent); extent = next; } free(pool); } } static inline void* pool_alloc(struct pool* pool) { void* data; data = pool->head_free; if(data == NULL) { /* we have no old-freed elem */ if(pool->fresh <= pool->tail) { /* pick a slice of the current extent */ data = (void*)pool->fresh; pool->fresh += pool->size_elem; } else { /* allocate a new extent */ data = pool_take_extent(pool, TRUE); } } else { /* re-used old freed element by chopipng the head of the free list */ pool->head_free = *(void**)data; } return data; } static inline void pool_free(struct pool* pool, void* data) { assert(pool && data); /* stack on top of the free list */ *(void**)data = pool->head_free; pool->head_free = data; } /* =============================================== * Hash implementation custumized to be just tracking * a unique list of string (i.e no data associated * with the key, no need for retrieval, etc.. * * This is tuned for the particular use-case we have here * measures in tail_build showed that * we can get north of 4000 distinct values stored in a hash * the collision rate is at worse around 2% * the collision needing an expensive memcmp to resolve * have a rate typically at 1 per 1000 * for tail_build we register 37229 unique key * with a total of 377 extra memcmp needed * which is completely negligible compared to the * number of memcmp required to eliminate duplicate * entry (north of 2.5 millions for tail_build) * =============================================== */ struct hash_elem { struct hash_elem* next; const char* key; int key_len; }; struct hash { struct hash_elem** array; struct pool* elems_pool; int flags; unsigned int used; unsigned int size; unsigned int load_limit; #ifdef HASH_STAT int stored; int collisions; int cost; int memcmp; #endif }; #define HASH_F_NO_RESIZE (1<<0) /* The following hash_compute function was adapted from : * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * The changes from the original are mostly cosmetic */ #define hashsize(n) (1<<(n)) #define hashmask(n) (hashsize(n)-1) #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) #if CORE_BIG_ENDIAN #define MASK_C1 0xFFFFFF00 #define MASK_C2 0xFFFF0000 #define MASK_C3 0xFF000000 #else #if CORE_LITTLE_ENDIAN #define MASK_C1 0xFFFFFF #define MASK_C2 0xFFFF #define MASK_C3 0xFF #else #error "Missing Endianness definition" #endif #endif #define mix(a,b,c) \ { \ a -= c; a ^= rot(c, 4); c += b; \ b -= a; b ^= rot(a, 6); a += c; \ c -= b; c ^= rot(b, 8); b += a; \ a -= c; a ^= rot(c,16); c += b; \ b -= a; b ^= rot(a,19); a += c; \ c -= b; c ^= rot(b, 4); b += a; \ } #define final(a,b,c) \ { \ c ^= b; c -= rot(b,14); \ a ^= c; a -= rot(c,11); \ b ^= a; b -= rot(a,25); \ c ^= b; c -= rot(b,16); \ a ^= c; a -= rot(c,4); \ b ^= a; b -= rot(a,14); \ c ^= b; c -= rot(b,24); \ } static unsigned int hash_compute( struct hash* hash, const char* key, int length) { unsigned int a; unsigned int b; unsigned int c; /* internal state */ const unsigned char* uk = (const unsigned char*)key; /* Set up the internal state */ a = b = c = 0xdeadbeef + (length << 2); /* we use this to 'hash' full path with mostly a common root * let's now waste too much cycles hashing mostly constant stuff */ if(length > 36) { uk += length - 36; length = 36; } /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ while (length > 12) { a += get_unaligned_uint(uk); b += get_unaligned_uint(uk+4); c += get_unaligned_uint(uk+8); mix(a,b,c); length -= 12; uk += 12; } /*----------------------------- handle the last (probably partial) block */ /* Note: we possibly over-read, which would trigger complaint from VALGRIND * but we mask the undefined stuff if any, so we are still good, thanks * to alignment of memory allocation and tail-memory managment overhead * we always can read 3 bytes past the official end without triggering * a segfault -- if you find a platform/compiler couple for which that postulat * is false, then you just need to over-allocate by 2 more bytes in file_load() * file_load already over-allocate by 1 to sitck a \0 at the end of the buffer. */ switch(length) { case 12: c+=get_unaligned_uint(uk+8); b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break; case 11: c+=get_unaligned_uint(uk+8) & MASK_C1; b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break; case 10: c+=get_unaligned_uint(uk+8) & MASK_C2; b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break; case 9 : c+=get_unaligned_uint(uk+8) & MASK_C3; b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break; case 8 : b+=get_unaligned_uint(uk+4); a+=get_unaligned_uint(uk); break; case 7 : b+=get_unaligned_uint(uk+4) & MASK_C1; a+=get_unaligned_uint(uk); break; case 6 : b+=get_unaligned_uint(uk+4) & MASK_C2; a+=get_unaligned_uint(uk); break; case 5 : b+=get_unaligned_uint(uk+4) & MASK_C3; a+=get_unaligned_uint(uk); break; case 4 : a+=get_unaligned_uint(uk); break; case 3 : a+=get_unaligned_uint(uk) & MASK_C1; break; case 2 : a+=get_unaligned_uint(uk) & MASK_C2; break; case 1 : a+=get_unaligned_uint(uk) & MASK_C3; break; case 0 : return c & hash->size; /* zero length strings require no mixing */ } final(a,b,c); return c & hash->size; } static void hash_destroy(struct hash* hash) { if(hash) { if(hash->array) { free(hash->array); } if(hash->elems_pool) { pool_destroy(hash->elems_pool); } free(hash); } } static struct hash* hash_create(unsigned int size) { struct hash* hash; assert(size > 0); hash = calloc(1, sizeof(struct hash)); if(hash) { size += (size >> 2) + 1; /* ~ 75% load factor */ if(size >= 15) { hash->size = (((unsigned int)0xFFFFFFFF) >> clz((unsigned int)size)); } else { hash->size = size = 15; } hash->load_limit = hash->size - (hash->size >> 2); hash->used = 0; hash->array = (struct hash_elem**)calloc(hash->size + 1, sizeof(struct hash_elem*)); if(hash->array == NULL) { hash_destroy(hash); hash = NULL; } } if(hash) { hash->elems_pool = pool_create(sizeof(struct hash_elem), 0, size, size << 1); if(!hash->elems_pool) { hash_destroy(hash); hash = NULL; } } return hash; } static void hash_resize(struct hash* hash) { unsigned int old_size = hash->size; unsigned int hashed; struct hash_elem* hash_elem; struct hash_elem* next; struct hash_elem** array; int i; hash->size = (old_size << 1) + 1; /* we really should avoid to get there... so print a message to alert of the condition */ fprintf(stderr, "resize hash %d -> %d\n", old_size, hash->size); if(hash->size == old_size) { hash->flags |= HASH_F_NO_RESIZE; return; } array = calloc(hash->size + 1, sizeof(struct hash_elem*)); if(array) { hash->load_limit = hash->size - (hash->size >> 2); for(i=0; i <= old_size; i++) { hash_elem = (struct hash_elem*)hash->array[i]; while(hash_elem) { next = hash_elem->next; hashed = hash_compute(hash, hash_elem->key, hash_elem->key_len); hash_elem->next = array[hashed]; array[hashed] = hash_elem; hash_elem = next; } } free(hash->array); hash->array = (struct hash_elem**)array; } else { hash->size = old_size; hash->flags |= HASH_F_NO_RESIZE; } } #ifdef HASH_STAT static inline int compare_key(struct hash* hash, const char* a, const char* b, int len, int* cost) { *cost += 1; hash->memcmp += 1; return memcmp(a,b, len); } #else #define compare_key(h,a,b,l,c) memcmp(a,b,l) #endif /* a customized hash_store function that just store the key and return * TRUE if the key was effectively stored, or FALSE if the key was already there */ static int hash_store(struct hash* hash, const char* key, int key_len) { unsigned int hashed; struct hash_elem* hash_elem; int cost = 0; hashed = hash_compute(hash, key, key_len); #ifdef HASH_STAT hash->stored += 1; #endif hash_elem = (struct hash_elem*)hash->array[hashed]; while(hash_elem && (hash_elem->key_len != key_len || compare_key(hash, hash_elem->key, key, key_len, &cost))) { hash_elem = hash_elem->next; } if(!hash_elem) { hash_elem = pool_alloc(hash->elems_pool); if(hash_elem) { hash_elem->key = key; hash_elem->key_len = key_len; hash_elem->next = hash->array[hashed]; #ifdef HASH_STAT if(hash_elem->next) { hash->collisions += 1; hash->cost += cost; } #endif hash->array[hashed] = hash_elem; hash->used += 1; if(hash->used > hash->load_limit) { hash_resize(hash); } } return TRUE; } return FALSE; } static int file_stat(const char* name, struct stat* buffer_stat, int* rc) { int rc_local = 0; rc_local = stat(name, buffer_stat); if (rc_local < 0) { *rc = errno; } return rc_local; } static off_t file_get_size(const char* name, int* rc) { struct stat buffer_stat; off_t size = -1; if (!file_stat(name, &buffer_stat, rc)) { if(S_ISREG(buffer_stat.st_mode)) { size = buffer_stat.st_size; } else { *rc = EINVAL; } } return size; } static char* file_load(const char* name, off_t* size, int* return_rc) { off_t local_size = 0; int rc = 0; char* buffer = NULL; int fd; assert(name != NULL); if(!size) { size = &local_size; } *size = file_get_size(name, &rc); if (!rc) { fd = open(name, FILE_O_RDONLY | FILE_O_BINARY); if (!(fd == -1)) { buffer = malloc((size_t)(*size + 1)); if (buffer == NULL) { rc = ENOMEM; } else { ssize_t i; REDO: i = read(fd, buffer, (size_t)(*size)); if(i == -1) { if(errno == EINTR) { goto REDO; } else { rc = errno; } } else { if (i != *size) { rc = EIO; } } close(fd); buffer[*size] = 0; } } } if(rc && buffer) { free(buffer); buffer = NULL; } if(return_rc) { *return_rc = rc; } return buffer; } static void _cancel_relative(char* base, char** ref_cursor, char** ref_cursor_out, char* end) { char* cursor = *ref_cursor; char* cursor_out = *ref_cursor_out; do { cursor += 3; while(cursor_out > base && cursor_out[-1] == '/') cursor_out--; while(cursor_out > base && *--cursor_out != '/'); } while(cursor + 3 < end && !memcmp(cursor, "/../", 4)); *ref_cursor = cursor; *ref_cursor_out = cursor_out; } static inline void eat_space(char ** token) { while ((' ' == **token) || ('\t' == **token)) { ++(*token); } } /* prefix paths to absolute */ static inline void print_fullpaths(char* line) { char* token; char* end; token = line; eat_space(&token); while (*token) { end = token; while (*end && (' ' != *end) && ('\t' != *end)) { ++end; } if(*token == ':' || *token == '\\' || *token == '/' || *token == '$' || ':' == token[1]) { fwrite(token, end - token, 1, stdout); } else { fputs(base_dir_var, stdout); fputc('/', stdout); fwrite(token, end - token, 1, stdout); } fputc(' ', stdout); token = end; eat_space(&token); } } static int _process(struct hash* dep_hash, char* fn) { int rc; char* buffer; char* end; char* cursor; char* cursor_out; char* base; int continuation = 0; char last_ns = 0; off_t size; buffer = file_load(fn, &size, &rc); /* Note: yes we are going to leak 'buffer' * this is on purpose, to avoid cloning the 'key' out of it * and our special 'hash' just store the pointer to the key * inside of buffer, hence it need to remain allocated */ if(!rc) { base = cursor_out = cursor = end = buffer; end += size; while(cursor < end) { if(*cursor == '\\') { continuation = 1; *cursor_out++ = *cursor++; } else if(*cursor == '/') { if(cursor + 3 < end) { if(!memcmp(cursor, "/../", 4)) { _cancel_relative(base, &cursor, &cursor_out, end); } } *cursor_out++ = *cursor++; } else if(*cursor == '\n') { if(!continuation) { *cursor_out = 0; if(base < cursor) { /* here we have a complete rule */ if(last_ns == ':') { /* if the rule ended in ':' that is a no-dep rule * these are the one for which we want to filter * duplicate out */ if(hash_store(dep_hash, base, (int)(cursor_out - base))) { /* DO NOT modify base after it has been added as key by hash_store */ print_fullpaths(base); putc('\n', stdout); } } else { /* rule with dep, just write it */ print_fullpaths(base); putc('\n', stdout); } } cursor += 1; base = cursor_out = cursor; } else { /* here we have a '\' followed by \n this is a continuation * i.e not a complete rule yet */ *cursor_out++ = *cursor++; } } else { continuation = 0; /* not using isspace() here save 25% of I refs and 75% of D refs based on cachegrind */ if(*cursor != ' ' && *cursor != '\n' && *cursor != '\t' ) { last_ns = *cursor; } *cursor_out++ = *cursor++; } } /* just in case the file did not end with a \n, there may be a pending rule */ if(base < cursor_out) { if(last_ns == ':') { if(hash_store(dep_hash, base, (int)(cursor_out - base))) { puts(base); putc('\n', stdout); } } else { puts(base); putc('\n', stdout); } } } return rc; } static void _usage(void) { fputs("Usage: concat-deps \n", stderr); } #define kDEFAULT_HASH_SIZE 4096 int main(int argc, char** argv) { int rc = 0; off_t in_list_size = 0; char* in_list; char* in_list_cursor; char* in_list_base; struct hash* dep_hash; if(argc < 2) { _usage(); return 1; } base_dir = getenv("SRCDIR"); if(!base_dir) { fputs("Error: SRCDIR is missing in the environement\n", stderr); return 1; } in_list = file_load(argv[1], &in_list_size, &rc); if(!rc) { dep_hash = hash_create( kDEFAULT_HASH_SIZE); in_list_base = in_list_cursor = in_list; /* extract filename of dep file from a 'space' separated list */ while(*in_list_cursor) { if(*in_list_cursor == ' ' || *in_list_cursor == '\n') { *in_list_cursor = 0; if(in_list_base < in_list_cursor) { rc = _process(dep_hash, in_list_base); if(rc) { break; } } in_list_cursor += 1; in_list_base = in_list_cursor; } else { in_list_cursor += 1; } } if(!rc) { /* catch the last entry in case the input did not terminate with a 'space' */ if(in_list_base < in_list_cursor) { rc = _process(dep_hash, in_list_base); } } #ifdef HASH_STAT fprintf(stderr, "stats: u:%d s:%d l:%d t:%d c:%d m:%d $:%d\n", dep_hash->used, dep_hash->size, dep_hash->load_limit, dep_hash->stored, dep_hash->collisions, dep_hash->memcmp, dep_hash->cost); #endif } return rc; } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */