5 files changed, 4697 insertions, 0 deletions
diff --git a/cvmx-malloc/README-malloc b/cvmx-malloc/README-malloc
new file mode 100644
index 000000000000..922a713410a9
--- /dev/null
+++ b/cvmx-malloc/README-malloc
@@ -0,0 +1,12 @@
+Readme for Octeon shared memory malloc
+
+This malloc is based on ptmalloc2, which is the malloc
+implementation of glibc.  Source code and more information
+on this can be found at http://www.malloc.de/en/index.html.
+Please see the individual files for licensing terms.
+
+The main change to the code modifies the way the malloc
+gets memory from the system.  Under Linux/Unix, malloc
+uses the brk or mmap system calls to request more memory.
+In this implementation, memory regions must be explicitly
+given to malloc by the application.
diff --git a/cvmx-malloc/arena.c b/cvmx-malloc/arena.c
new file mode 100644
index 000000000000..8e0ce1fe25fa
--- /dev/null
+++ b/cvmx-malloc/arena.c
@@ -0,0 +1,293 @@
+/*
+Copyright (c) 2001 Wolfram Gloger
+Copyright (c) 2006 Cavium networks
+
+Permission to use, copy, modify, distribute, and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that (i) the above copyright notices and this permission
+notice appear in all copies of the software and related documentation,
+and (ii) the name of Wolfram Gloger may not be used in any advertising
+or publicity relating to the software.
+
+THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+IN NO EVENT SHALL WOLFRAM GLOGER BE LIABLE FOR ANY SPECIAL,
+INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY
+OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+*/
+
+/* $Id: arena.c 30481 2007-12-05 21:46:59Z rfranz $ */
+
+/* Compile-time constants.  */
+
+#define HEAP_MIN_SIZE (4096)   /* Must leave room for struct malloc_state, arena ptrs, etc., totals about 2400 bytes */
+
+#ifndef THREAD_STATS
+#define THREAD_STATS 0
+#endif
+
+/* If THREAD_STATS is non-zero, some statistics on mutex locking are
+   computed.  */
+
+/***************************************************************************/
+
+// made static to avoid conflicts with newlib
+static mstate         _int_new_arena __MALLOC_P ((size_t __ini_size));
+
+/***************************************************************************/
+
+#define top(ar_ptr) ((ar_ptr)->top)
+
+/* A heap is a single contiguous memory region holding (coalesceable)
+   malloc_chunks.    Not used unless compiling with
+   USE_ARENAS. */
+
+typedef struct _heap_info {
+  mstate ar_ptr; /* Arena for this heap; cvmx_new_arena points it just past this header. */
+  struct _heap_info *prev; /* Previous heap. */
+  size_t size;   /* Current size in bytes, counted from the start of this header. */
+  size_t pad;    /* Make sure the following data is properly aligned. */
+} heap_info;
+
+/* Thread specific data */
+
+static tsd_key_t arena_key;  // one per PP (thread)
+static CVMX_SHARED mutex_t list_lock;  // shared...
+
+#if THREAD_STATS
+static int stat_n_heaps;
+#define THREAD_STAT(x) x
+#else
+#define THREAD_STAT(x) do ; while(0)
+#endif
+
+/* Mapped memory in non-main arenas (reliable only for NO_THREADS). */
+static unsigned long arena_mem;
+
+/* Already initialized? */
+int CVMX_SHARED cvmx__malloc_initialized = -1;
+
+/**************************************************************************/
+
+#if USE_ARENAS
+
+/* find the heap and corresponding arena for a given ptr */
+
+#define arena_for_chunk(ptr) ((ptr)->arena_ptr)
+#define set_arena_for_chunk(ptr, arena) (ptr)->arena_ptr = (arena)
+
+
+#endif /* USE_ARENAS */
+
+/**************************************************************************/
+
+#ifndef NO_THREADS
+
+/* atfork support.  */
+
+static __malloc_ptr_t (*save_malloc_hook) __MALLOC_P ((size_t __size,
+						       __const __malloc_ptr_t));
+static void           (*save_free_hook) __MALLOC_P ((__malloc_ptr_t __ptr,
+						     __const __malloc_ptr_t));
+static Void_t*        save_arena;
+
+/* Magic value for the thread-specific arena pointer when
+   malloc_atfork() is in use.  */
+
+#define ATFORK_ARENA_PTR ((Void_t*)-1)
+
+/* The following hooks are used while the `atfork' handling mechanism
+   is active. */
+
+static Void_t*
+malloc_atfork(size_t sz, const Void_t *caller)
+{
+  return NULL;  /* stub: always reports failure; sz and caller are ignored */
+}
+
+static void
+free_atfork(Void_t* mem, const Void_t *caller)
+{
+  Void_t *tsd_arena = NULL;   /* filled in by tsd_getspecific(arena_key, ...) */
+  mstate owner;               /* arena the chunk belongs to */
+  mchunkptr chunk;            /* chunk corresponding to mem */
+  int take_lock;              /* zero only while the atfork marker is set */
+
+  if (mem == 0)               /* free(0) has no effect */
+    return;
+
+  chunk = mem2chunk(mem);     /* do not bother to replicate free_check here */
+#if HAVE_MMAP
+  if (chunk_is_mmapped(chunk)) {   /* release mmapped memory. */
+    munmap_chunk(chunk);
+    return;
+  }
+#endif
+
+  owner = arena_for_chunk(chunk);
+  tsd_getspecific(arena_key, tsd_arena);
+  take_lock = (tsd_arena != ATFORK_ARENA_PTR);
+  if (take_lock)
+    (void)mutex_lock(&owner->mutex);
+  _int_free(owner, mem);
+  if (take_lock)
+    (void)mutex_unlock(&owner->mutex);
+}
+
+
+
+#ifdef __linux__
+#error   __linux__defined!
+#endif
+
+#endif /* !defined NO_THREADS */
+
+
+
+/* Initialization routine. */
+#ifdef _LIBC
+#error  _LIBC is defined, and should not be
+#endif /* _LIBC */
+
+static CVMX_SHARED cvmx_spinlock_t malloc_init_spin_lock;
+
+
+
+
+/* Managing heaps and arenas (for concurrent threads) */
+
+#if USE_ARENAS
+
+#if MALLOC_DEBUG > 1
+
+/* Print the complete contents of a single heap to stderr. */
+
+static void
+#if __STD_C
+dump_heap(heap_info *heap)
+#else
+dump_heap(heap) heap_info *heap;
+#endif
+{
+  char *ptr;
+  mchunkptr p;
+
+  /* %p requires a void* argument and %lx an unsigned long one, hence the casts. */
+  fprintf(stderr, "Heap %p, size %10lx:\n", (void*)heap, (unsigned long)heap->size);
+  ptr = (heap->ar_ptr != (mstate)(heap+1)) ?  /* skip malloc_state too if the arena lives here */
+    (char*)(heap + 1) : (char*)(heap + 1) + sizeof(struct malloc_state);
+  p = (mchunkptr)(((unsigned long)ptr + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK);
+  for(;;) {  /* walk chunk by chunk until the top chunk or a fence is printed */
+    fprintf(stderr, "chunk %p size %10lx", (void*)p, (unsigned long)p->size);
+    if(p == top(heap->ar_ptr)) {
+      fprintf(stderr, " (top)\n");
+      break;
+    } else if(p->size == (0|PREV_INUSE)) {
+      fprintf(stderr, " (fence)\n");
+      break;
+    }
+    fprintf(stderr, "\n");
+    p = next_chunk(p);
+  }
+}
+
+#endif /* MALLOC_DEBUG > 1 */
+/* Build an arena inside the caller-supplied region [addr, addr+size).
+   Returns the arena, or NULL if addr is NULL or size < HEAP_MIN_SIZE.
+   The whole region is zeroed first, so any previous contents are lost. */
+static mstate cvmx_new_arena(void *addr, size_t size)
+{
+  mstate a;
+  heap_info *h;
+  char *ptr;
+  unsigned long misalign;
+
+  debug_printf("cvmx_new_arena called, addr: %p, size %lu\n", addr, (unsigned long)size);
+  debug_printf("heapinfo size: %lu, mstate size: %lu\n",
+               (unsigned long)sizeof(heap_info), (unsigned long)sizeof(struct malloc_state));
+
+  if (!addr || (size < HEAP_MIN_SIZE))
+  {
+      return(NULL);
+  }
+  /* We must zero out the arena as the malloc code assumes this. */
+  memset(addr, 0, size);
+
+  h = (heap_info *)addr;
+  h->size = size;
+
+  /* The malloc_state is placed directly behind the heap_info header. */
+  a = h->ar_ptr = (mstate)(h+1);
+  malloc_init_state(a);
+  a->system_mem = a->max_system_mem = h->size;
+  arena_mem += h->size;
+  a->next = a;   /* a lone arena links to itself */
+
+  /* Set up the top chunk, with proper alignment. */
+  ptr = (char *)(a + 1);
+  misalign = (unsigned long)chunk2mem(ptr) & MALLOC_ALIGN_MASK;
+  if (misalign > 0)
+    ptr += MALLOC_ALIGNMENT - misalign;
+  top(a) = (mchunkptr)ptr;
+  set_head(top(a), (((char*)h + h->size) - ptr) | PREV_INUSE);
+
+  return a;
+}
+
+
+int cvmx_add_arena(cvmx_arena_list_t *arena_list, void *ptr, size_t size)
+{
+  /* Turn [ptr, ptr+size) into an arena linked into *arena_list; 0 on success, -1 on failure. */
+  mstate a;
+  size_t misaligned = ((size_t)ptr) & (MALLOC_ALIGNMENT - 1);
+
+  if (!arena_list)
+      return(-1);
+
+  /* Enforce required alignment, and adjust size */
+  if (misaligned)
+  {
+      size_t skip = MALLOC_ALIGNMENT - misaligned;
+      if (size < skip)  /* size -= skip would wrap and slip past the minimum-size check */
+          return(-1);
+      ptr = (char*)ptr + skip;
+      size -= skip;
+  }
+  debug_printf("Adding arena at addr: %p, size %lu\n", ptr, (unsigned long)size);
+
+  a = cvmx_new_arena(ptr, size);  /* checks ptr and size */
+  if (!a)
+      return(-1);
+
+  debug_printf("cvmx_add_arena - arena_list: %p, *arena_list: %p\n", arena_list, *arena_list);
+  debug_printf("cvmx_add_arena - list: %p, new: %p\n", *arena_list, a);
+  mutex_init(&a->mutex);
+  mutex_lock(&a->mutex);
+
+  if (*arena_list)
+  {
+      mstate ar_ptr = *arena_list;
+      (void)mutex_lock(&ar_ptr->mutex);
+      a->next = ar_ptr->next;  // lock held on a and ar_ptr
+      ar_ptr->next = a;
+      (void)mutex_unlock(&ar_ptr->mutex);
+  }
+  else
+  {
+      *arena_list = a;  /* first arena: cvmx_new_arena already set a->next = a */
+  }
+  debug_printf("cvmx_add_arena - list: %p, list->next: %p\n", *arena_list, ((mstate)*arena_list)->next);
+
+  // unlock, since it is not going to be used immediately
+  (void)mutex_unlock(&a->mutex);
+  return(0);
+}
+
+
+
+#endif /* USE_ARENAS */
diff --git a/cvmx-malloc/malloc.c b/cvmx-malloc/malloc.c
new file mode 100644
index 000000000000..222ad5def124
--- /dev/null
+++ b/cvmx-malloc/malloc.c
@@ -0,0 +1,4106 @@
+/*
+Copyright (c) 2001 Wolfram Gloger
+Copyright (c) 2006 Cavium networks
+
+Permission to use, copy, modify, distribute, and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that (i) the above copyright notices and this permission
+notice appear in all copies of the software and related documentation,
+and (ii) the name of Wolfram Gloger may not be used in any advertising
+or publicity relating to the software.
+
+THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+IN NO EVENT SHALL WOLFRAM GLOGER BE LIABLE FOR ANY SPECIAL,
+INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY
+OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+*/
+
+/*
+  This is a version (aka ptmalloc2) of malloc/free/realloc written by
+  Doug Lea and adapted to multiple threads/arenas by Wolfram Gloger.
+
+* Version ptmalloc2-20011215
+  $Id: malloc.c 30481 2007-12-05 21:46:59Z rfranz $
+  based on:
+  VERSION 2.7.1pre1 Sat May 12 07:41:21 2001  Doug Lea  (dl at gee)
+
+   Note: There may be an updated version of this malloc obtainable at
+           http://www.malloc.de/malloc/ptmalloc2.tar.gz
+         Check before installing!
+
+* Quickstart
+
+  In order to compile this implementation, a Makefile is provided with
+  the ptmalloc2 distribution, which has pre-defined targets for some
+  popular systems (e.g. "make posix" for Posix threads).  All that is
+  typically required with regard to compiler flags is the selection of
+  the thread package via defining one out of USE_PTHREADS, USE_THR or
+  USE_SPROC.  Check the thread-m.h file for what effects this has.
+  Many/most systems will additionally require USE_TSD_DATA_HACK to be
+  defined, so this is the default for "make posix".
+
+* Why use this malloc?
+
+  This is not the fastest, most space-conserving, most portable, or
+  most tunable malloc ever written. However it is among the fastest
+  while also being among the most space-conserving, portable and tunable.
+  Consistent balance across these factors results in a good general-purpose
+  allocator for malloc-intensive programs.
+
+  The main properties of the algorithms are:
+  * For large (>= 512 bytes) requests, it is a pure best-fit allocator,
+    with ties normally decided via FIFO (i.e. least recently used).
+  * For small (<= 64 bytes by default) requests, it is a caching
+    allocator, that maintains pools of quickly recycled chunks.
+  * In between, and for combinations of large and small requests, it does
+    the best it can trying to meet both goals at once.
+  * For very large requests (>= 128KB by default), it relies on system
+    memory mapping facilities, if supported.
+
+  For a longer but slightly out of date high-level description, see
+     http://gee.cs.oswego.edu/dl/html/malloc.html
+
+  You may already by default be using a C library containing a malloc
+  that is  based on some version of this malloc (for example in
+  linux). You might still want to use the one in this file in order to
+  customize settings or to avoid overheads associated with library
+  versions.
+
+* Contents, described in more detail in "description of public routines" below.
+
+  Standard (ANSI/SVID/...)  functions:
+    malloc(size_t n);
+    calloc(size_t n_elements, size_t element_size);
+    free(Void_t* p);
+    realloc(Void_t* p, size_t n);
+    memalign(size_t alignment, size_t n);
+    valloc(size_t n);
+    mallinfo()
+    mallopt(int parameter_number, int parameter_value)
+
+  Additional functions:
+    independent_calloc(size_t n_elements, size_t size, Void_t* chunks[]);
+    independent_comalloc(size_t n_elements, size_t sizes[], Void_t* chunks[]);
+    pvalloc(size_t n);
+    cfree(Void_t* p);
+    malloc_trim(size_t pad);
+    malloc_usable_size(Void_t* p);
+    malloc_stats();
+
+* Vital statistics:
+
+  Supported pointer representation:       4 or 8 bytes
+  Supported size_t  representation:       4 or 8 bytes
+       Note that size_t is allowed to be 4 bytes even if pointers are 8.
+       You can adjust this by defining INTERNAL_SIZE_T
+
+  Alignment:                              2 * sizeof(size_t) (default)
+       (i.e., 8 byte alignment with 4byte size_t). This suffices for
+       nearly all current machines and C compilers. However, you can
+       define MALLOC_ALIGNMENT to be wider than this if necessary.
+
+  Minimum overhead per allocated chunk:   4 or 8 bytes
+       Each malloced chunk has a hidden word of overhead holding size
+       and status information.
+
+  Minimum allocated size: 4-byte ptrs:  16 bytes    (including 4 overhead)
+                          8-byte ptrs:  24/32 bytes (including 4/8 overhead)
+
+       When a chunk is freed, 12 (for 4byte ptrs) or 20 (for 8 byte
+       ptrs but 4 byte size) or 24 (for 8/8) additional bytes are
+       needed; 4 (8) for a trailing size field and 8 (16) bytes for
+       free list pointers. Thus, the minimum allocatable size is
+       16/24/32 bytes.
+
+       Even a request for zero bytes (i.e., malloc(0)) returns a
+       pointer to something of the minimum allocatable size.
+
+       The maximum overhead wastage (i.e., number of extra bytes
+       allocated than were requested in malloc) is less than or equal
+       to the minimum size, except for requests >= mmap_threshold that
+       are serviced via mmap(), where the worst case wastage is 2 *
+       sizeof(size_t) bytes plus the remainder from a system page (the
+       minimal mmap unit); typically 4096 or 8192 bytes.
+
+  Maximum allocated size:  4-byte size_t: 2^32 minus about two pages
+                           8-byte size_t: 2^64 minus about two pages
+
+       It is assumed that (possibly signed) size_t values suffice to
+       represent chunk sizes. `Possibly signed' is due to the fact
+       that `size_t' may be defined on a system as either a signed or
+       an unsigned type. The ISO C standard says that it must be
+       unsigned, but a few systems are known not to adhere to this.
+       Additionally, even when size_t is unsigned, sbrk (which is by
+       default used to obtain memory from system) accepts signed
+       arguments, and may not be able to handle size_t-wide arguments
+       with negative sign bit.  Generally, values that would
+       appear as negative after accounting for overhead and alignment
+       are supported only via mmap(), which does not have this
+       limitation.
+
+       Requests for sizes outside the allowed range will perform an optional
+       failure action and then return null. (Requests may also
+       fail because a system is out of memory.)
+
+  Thread-safety: thread-safe unless NO_THREADS is defined
+
+  Compliance: I believe it is compliant with the 1997 Single Unix Specification
+       (See http://www.opennc.org). Also SVID/XPG, ANSI C, and probably
+       others as well.
+
+* Synopsis of compile-time options:
+
+    People have reported using previous versions of this malloc on all
+    versions of Unix, sometimes by tweaking some of the defines
+    below. It has been tested most extensively on Solaris and
+    Linux. It is also reported to work on WIN32 platforms.
+    People also report using it in stand-alone embedded systems.
+
+    The implementation is in straight, hand-tuned ANSI C.  It is not
+    at all modular. (Sorry!)  It uses a lot of macros.  To be at all
+    usable, this code should be compiled using an optimizing compiler
+    (for example gcc -O3) that can simplify expressions and control
+    paths. (FAQ: some macros import variables as arguments rather than
+    declare locals because people reported that some debuggers
+    otherwise get confused.)
+
+    OPTION                     DEFAULT VALUE
+
+    Compilation Environment options:
+
+    __STD_C                    derived from C compiler defines
+    WIN32                      NOT defined
+    HAVE_MEMCPY                defined
+    USE_MEMCPY                 1 if HAVE_MEMCPY is defined
+    HAVE_MMAP                  defined as 1
+    MMAP_CLEARS                1
+    HAVE_MREMAP                0 unless linux defined
+    USE_ARENAS                 the same as HAVE_MMAP
+    malloc_getpagesize         derived from system #includes, or 4096 if not
+    HAVE_USR_INCLUDE_MALLOC_H  NOT defined
+    LACKS_UNISTD_H             NOT defined unless WIN32
+    LACKS_SYS_PARAM_H          NOT defined unless WIN32
+    LACKS_SYS_MMAN_H           NOT defined unless WIN32
+
+    Changing default word sizes:
+
+    INTERNAL_SIZE_T            size_t
+    MALLOC_ALIGNMENT           2 * sizeof(INTERNAL_SIZE_T)
+
+    Configuration and functionality options:
+
+    USE_DL_PREFIX              NOT defined
+    USE_PUBLIC_MALLOC_WRAPPERS NOT defined
+    USE_MALLOC_LOCK            NOT defined
+    MALLOC_DEBUG               NOT defined
+    REALLOC_ZERO_BYTES_FREES   1
+    MALLOC_FAILURE_ACTION      errno = ENOMEM, if __STD_C defined, else no-op
+    TRIM_FASTBINS              0
+    FIRST_SORTED_BIN_SIZE      512
+
+    Options for customizing MORECORE:
+
+    MORECORE                   sbrk
+    MORECORE_FAILURE           -1
+    MORECORE_CONTIGUOUS        1
+    MORECORE_CANNOT_TRIM       NOT defined
+    MORECORE_CLEARS            1
+    MMAP_AS_MORECORE_SIZE      (1024 * 1024)
+
+    Tuning options that are also dynamically changeable via mallopt:
+
+    DEFAULT_MXFAST             64
+    DEFAULT_TRIM_THRESHOLD     128 * 1024
+    DEFAULT_TOP_PAD            0
+    DEFAULT_MMAP_THRESHOLD     128 * 1024
+    DEFAULT_MMAP_MAX           65536
+
+    There are several other #defined constants and macros that you
+    probably don't want to touch unless you are extending or adapting malloc.  */
+
+/*
+  __STD_C should be nonzero if using ANSI-standard C compiler, a C++
+  compiler, or a C compiler sufficiently close to ANSI to get away
+  with it.
+*/
+
+#include "cvmx-config.h"
+#include "cvmx.h"
+#include "cvmx-spinlock.h"
+#include "cvmx-malloc.h"
+
+
+#ifndef __STD_C
+#if defined(__STDC__) || defined(__cplusplus)
+#define __STD_C     1
+#else
+#define __STD_C     0
+#endif
+#endif /*__STD_C*/
+
+
+/*
+  Void_t* is the pointer type that malloc should say it returns
+*/
+
+#ifndef Void_t
+#if 1
+#define Void_t      void
+#else
+#define Void_t      char
+#endif
+#endif /*Void_t*/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* define LACKS_UNISTD_H if your system does not have a <unistd.h>. */
+
+/* #define  LACKS_UNISTD_H */
+
+#ifndef LACKS_UNISTD_H
+#include <unistd.h>
+#endif
+
+/* define LACKS_SYS_PARAM_H if your system does not have a <sys/param.h>. */
+
+/* #define  LACKS_SYS_PARAM_H */
+
+
+#include <stdio.h>    /* needed for malloc_stats */
+#include <errno.h>    /* needed for optional MALLOC_FAILURE_ACTION */
+
+
+/*
+  Debugging:
+
+  Because freed chunks may be overwritten with bookkeeping fields, this
+  malloc will often die when freed memory is overwritten by user
+  programs.  This can be very effective (albeit in an annoying way)
+  in helping track down dangling pointers.
+
+  If you compile with -DMALLOC_DEBUG, a number of assertion checks are
+  enabled that will catch more memory errors. You probably won't be
+  able to make much sense of the actual assertion errors, but they
+  should help you locate incorrectly overwritten memory.  The checking
+  is fairly extensive, and will slow down execution
+  noticeably. Calling malloc_stats or mallinfo with MALLOC_DEBUG set
+  will attempt to check every non-mmapped allocated and free chunk in
+  the course of computing the summaries. (By nature, mmapped regions
+  cannot be checked very much automatically.)
+
+  Setting MALLOC_DEBUG may also be helpful if you are trying to modify
+  this code. The assertions in the check routines spell out in more
+  detail the assumptions and invariants underlying the algorithms.
+
+  Setting MALLOC_DEBUG does NOT provide an automated mechanism for
+  checking that all accesses to malloced memory stay within their
+  bounds. However, there are several add-ons and adaptations of this
+  or other mallocs available that do this.
+*/
+
+#define MALLOC_DEBUG 1
+#if MALLOC_DEBUG
+#include <assert.h>
+#else
+#define assert(x) ((void)0)
+#endif
+
+
+/*
+  INTERNAL_SIZE_T is the word-size used for internal bookkeeping
+  of chunk sizes.
+
+  The default version is the same as size_t.
+
+  While not strictly necessary, it is best to define this as an
+  unsigned type, even if size_t is a signed type. This may avoid some
+  artificial size limitations on some systems.
+
+  On a 64-bit machine, you may be able to reduce malloc overhead by
+  defining INTERNAL_SIZE_T to be a 32 bit `unsigned int' at the
+  expense of not being able to handle more than 2^32 of malloced
+  space. If this limitation is acceptable, you are encouraged to set
+  this unless you are on a platform requiring 16byte alignments. In
+  this case the alignment requirements turn out to negate any
+  potential advantages of decreasing size_t word size.
+
+  Implementors: Beware of the possible combinations of:
+     - INTERNAL_SIZE_T might be signed or unsigned, might be 32 or 64 bits,
+       and might be the same width as int or as long
+     - size_t might have different width and signedness as INTERNAL_SIZE_T
+     - int and long might be 32 or 64 bits, and might be the same width
+  To deal with this, most comparisons and difference computations
+  among INTERNAL_SIZE_Ts should cast them to unsigned long, being
+  aware of the fact that casting an unsigned int to a wider long does
+  not sign-extend. (This also makes checking for negative numbers
+  awkward.) Some of these casts result in harmless compiler warnings
+  on some systems.
+*/
+
+#ifndef INTERNAL_SIZE_T
+#define INTERNAL_SIZE_T size_t
+#endif
+
+/* The corresponding word size */
+#define SIZE_SZ                (sizeof(INTERNAL_SIZE_T))
+
+
+/*
+  MALLOC_ALIGNMENT is the minimum alignment for malloc'ed chunks.
+  It must be a power of two at least 2 * SIZE_SZ, even on machines
+  for which smaller alignments would suffice. It may be defined as
+  larger than this though. Note however that code and data structures
+  are optimized for the case of 8-byte alignment.
+*/
+
+
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT       (2 * SIZE_SZ)
+#endif
+
+/* The corresponding bit mask value */
+#define MALLOC_ALIGN_MASK      (MALLOC_ALIGNMENT - 1)
+
+
+
+/*
+  REALLOC_ZERO_BYTES_FREES should be set if a call to
+  realloc with zero bytes should be the same as a call to free.
+  This is required by the C standard. Otherwise, since this malloc
+  returns a unique pointer for malloc(0), so does realloc(p, 0).
+*/
+
+#ifndef REALLOC_ZERO_BYTES_FREES
+#define REALLOC_ZERO_BYTES_FREES 1
+#endif
+
+/*
+  TRIM_FASTBINS controls whether free() of a very small chunk can
+  immediately lead to trimming. Setting to true (1) can reduce memory
+  footprint, but will almost always slow down programs that use a lot
+  of small chunks.
+
+  Define this only if you are willing to give up some speed to more
+  aggressively reduce system-level memory footprint when releasing
+  memory in programs that use many small chunks.  You can get
+  essentially the same effect by setting MXFAST to 0, but this can
+  lead to even greater slowdowns in programs using many small chunks.
+  TRIM_FASTBINS is an in-between compile-time option, that disables
+  only those chunks bordering topmost memory from being placed in
+  fastbins.
+*/
+
+#ifndef TRIM_FASTBINS
+#define TRIM_FASTBINS  0
+#endif
+
+
+/*
+  USE_DL_PREFIX will prefix all public routines with the string 'dl'.
+  This is necessary when you only want to use this malloc in one part
+  of a program, using your regular system malloc elsewhere.
+*/
+
+#define USE_DL_PREFIX
+
+
+/*
+   Two-phase name translation.
+   All of the actual routines are given mangled names.
+   When wrappers are used, they become the public callable versions.
+   When DL_PREFIX is used, the callable names are prefixed.
+*/
+
+#ifdef USE_DL_PREFIX
+#define public_cALLOc    cvmx_calloc
+#define public_fREe      cvmx_free
+#define public_cFREe     dlcfree
+#define public_mALLOc    cvmx_malloc
+#define public_mEMALIGn  cvmx_memalign
+#define public_rEALLOc   cvmx_realloc
+#define public_vALLOc    dlvalloc
+#define public_pVALLOc   dlpvalloc
+#define public_mALLINFo  dlmallinfo
+#define public_mALLOPt   dlmallopt
+#define public_mTRIm     dlmalloc_trim
+#define public_mSTATs    dlmalloc_stats
+#define public_mUSABLe   dlmalloc_usable_size
+#define public_iCALLOc   dlindependent_calloc
+#define public_iCOMALLOc dlindependent_comalloc
+#define public_gET_STATe dlget_state
+#define public_sET_STATe dlset_state
+#else /* USE_DL_PREFIX */
+#ifdef _LIBC
+#error _LIBC defined and should not be
+/* Special defines for the GNU C library.  */
+#define public_cALLOc    __libc_calloc
+#define public_fREe      __libc_free
+#define public_cFREe     __libc_cfree
+#define public_mALLOc    __libc_malloc
+#define public_mEMALIGn  __libc_memalign
+#define public_rEALLOc   __libc_realloc
+#define public_vALLOc    __libc_valloc
+#define public_pVALLOc   __libc_pvalloc
+#define public_mALLINFo  __libc_mallinfo
+#define public_mALLOPt   __libc_mallopt
+#define public_mTRIm     __malloc_trim
+#define public_mSTATs    __malloc_stats
+#define public_mUSABLe   __malloc_usable_size
+#define public_iCALLOc   __libc_independent_calloc
+#define public_iCOMALLOc __libc_independent_comalloc
+#define public_gET_STATe __malloc_get_state
+#define public_sET_STATe __malloc_set_state
+#define malloc_getpagesize __getpagesize()
+#define open             __open
+#define mmap             __mmap
+#define munmap           __munmap
+#define mremap           __mremap
+#define mprotect         __mprotect
+#define MORECORE         (*__morecore)
+#define MORECORE_FAILURE 0
+
+Void_t * __default_morecore (ptrdiff_t);
+Void_t *(*__morecore)(ptrdiff_t) = __default_morecore;
+
+#else /* !_LIBC */
+#define public_cALLOc    calloc
+#define public_fREe      free
+#define public_cFREe     cfree
+#define public_mALLOc    malloc
+#define public_mEMALIGn  memalign
+#define public_rEALLOc   realloc
+#define public_vALLOc    valloc
+#define public_pVALLOc   pvalloc
+#define public_mALLINFo  mallinfo
+#define public_mALLOPt   mallopt
+#define public_mTRIm     malloc_trim
+#define public_mSTATs    malloc_stats
+#define public_mUSABLe   malloc_usable_size
+#define public_iCALLOc   independent_calloc
+#define public_iCOMALLOc independent_comalloc
+#define public_gET_STATe malloc_get_state
+#define public_sET_STATe malloc_set_state
+#endif /* _LIBC */
+#endif /* USE_DL_PREFIX */
+
+
+/*
+  HAVE_MEMCPY should be defined if you are not otherwise using
+  ANSI STD C, but still have memcpy and memset in your C library
+  and want to use them in calloc and realloc. Otherwise simple
+  macro versions are defined below.
+
+  USE_MEMCPY should be defined as 1 if you actually want to
+  have memset and memcpy called. People report that the macro
+  versions are faster than libc versions on some systems.
+
+  Even if USE_MEMCPY is set to 1, loops to copy/clear small chunks
+  (of <= 36 bytes) are manually unrolled in realloc and calloc.
+*/
+
+#define HAVE_MEMCPY
+
+#ifndef USE_MEMCPY
+#ifdef HAVE_MEMCPY
+#define USE_MEMCPY 1
+#else
+#define USE_MEMCPY 0
+#endif
+#endif
+
+
+#if (__STD_C || defined(HAVE_MEMCPY))
+
+#ifdef WIN32
+/* On Win32 memset and memcpy are already declared in windows.h */
+#else
+#if __STD_C
+void* memset(void*, int, size_t);
+void* memcpy(void*, const void*, size_t);
+#else
+Void_t* memset();
+Void_t* memcpy();
+#endif
+#endif
+#endif
+
+/*
+  MALLOC_FAILURE_ACTION is the action to take before "return 0" when
+  malloc fails to be able to return memory, either because memory is
+  exhausted or because of illegal arguments.
+
+  By default, sets errno if running on STD_C platform, else does nothing.
+*/
+
+#ifndef MALLOC_FAILURE_ACTION
+#if __STD_C
+#define MALLOC_FAILURE_ACTION \
+   errno = ENOMEM;
+
+#else
+#define MALLOC_FAILURE_ACTION
+#endif
+#endif
+
+/*
+  MORECORE-related declarations. By default, rely on sbrk
+*/
+
+
+#ifdef LACKS_UNISTD_H
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
+#if __STD_C
+extern Void_t*     sbrk(ptrdiff_t);
+#else
+extern Void_t*     sbrk();
+#endif
+#endif
+#endif
+
+/*
+  MORECORE is the name of the routine to call to obtain more memory
+  from the system.  See below for general guidance on writing
+  alternative MORECORE functions, as well as a version for WIN32 and a
+  sample version for pre-OSX macos.
+*/
+#undef MORECORE  // not supported
+#ifndef MORECORE
+#define MORECORE notsupported
+#endif
+
+/*
+  MORECORE_FAILURE is the value returned upon failure of MORECORE
+  as well as mmap. Since it cannot be an otherwise valid memory address,
+  and must reflect values of standard sys calls, you probably ought not
+  try to redefine it.
+*/
+
+#ifndef MORECORE_FAILURE
+#define MORECORE_FAILURE (-1)
+#endif
+
+/*
+  If MORECORE_CONTIGUOUS is true, take advantage of fact that
+  consecutive calls to MORECORE with positive arguments always return
+  contiguous increasing addresses.  This is true of unix sbrk.  Even
+  if not defined, when regions happen to be contiguous, malloc will
+  permit allocations spanning regions obtained from different
+  calls. But defining this when applicable enables some stronger
+  consistency checks and space efficiencies.
+*/
+
+#ifndef MORECORE_CONTIGUOUS
+#define MORECORE_CONTIGUOUS 0
+#endif
+
+/*
+  Define MORECORE_CANNOT_TRIM if your version of MORECORE
+  cannot release space back to the system when given negative
+  arguments. This is generally necessary only if you are using
+  a hand-crafted MORECORE function that cannot handle negative arguments.
+*/
+
+#define MORECORE_CANNOT_TRIM 1
+
+/*  MORECORE_CLEARS           (default 0 here; 1 in stock ptmalloc2)
+     The degree to which the routine mapped to MORECORE zeroes out
+     memory: never (0), only for newly allocated space (1) or always
+     (2).  The distinction between (1) and (2) is necessary because on
+     some systems, if the application first decrements and then
+     increments the break value, the contents of the reallocated space
+     are unspecified.
+*/
+
+#ifndef MORECORE_CLEARS
+#define MORECORE_CLEARS 0
+#endif
+
+
+/*
+  Define HAVE_MMAP as true to optionally make malloc() use mmap() to
+  allocate very large blocks.  These will be returned to the
+  operating system immediately after a free(). Also, if mmap
+  is available, it is used as a backup strategy in cases where
+  MORECORE fails to provide space from system.
+
+  This malloc is best tuned to work with mmap for large requests.
+  If you do not have mmap, operations involving very large chunks (1MB
+  or so) may be slower than you'd like.
+*/
+
+#undef HAVE_MMAP
+#ifndef HAVE_MMAP
+#define HAVE_MMAP 0
+
+/*
+   Standard unix mmap using /dev/zero clears memory so calloc doesn't
+   need to.  Here, however, HAVE_MMAP is force-undefined just above, so
+   this branch is always taken: HAVE_MMAP is 0 and MMAP_CLEARS
+   defaults to 0 unless it was defined beforehand.
+*/
+
+#ifndef MMAP_CLEARS
+#define MMAP_CLEARS 0
+#endif
+
+#else /* HAVE_MMAP predefined; unreachable after the #undef above */
+#ifndef MMAP_CLEARS
+#define MMAP_CLEARS 0
+#endif
+#endif
+
+
+/*
+   MMAP_AS_MORECORE_SIZE is the minimum mmap size argument to use if
+   sbrk fails, and mmap is used as a backup (which is done only if
+   HAVE_MMAP).  The value must be a multiple of page size.  This
+   backup strategy generally applies only when systems have "holes" in
+   address space, so sbrk cannot perform contiguous expansion, but
+   there is still space available on system.  On systems for which
+   this is known to be useful (i.e. most linux kernels), this occurs
+   only when programs allocate huge amounts of memory.  Between this,
+   and the fact that mmap regions tend to be limited, the size should
+   be large, to avoid too many mmap calls and thus avoid running out
+   of kernel resources.
+*/
+
+#ifndef MMAP_AS_MORECORE_SIZE
+#define MMAP_AS_MORECORE_SIZE (1024 * 1024)
+#endif
+
+/*
+  Define HAVE_MREMAP to make realloc() use mremap() to re-allocate
+  large blocks.  This is currently only possible on Linux with
+  kernel versions newer than 1.3.77.
+
+  The `linux' macro is undefined first, so the #ifdef linux test
+  below is always false: unless HAVE_MREMAP was already defined
+  before this point, it ends up 0.
+*/
+#undef linux
+#ifndef HAVE_MREMAP
+#ifdef linux
+#define HAVE_MREMAP 1
+#else
+#define HAVE_MREMAP 0
+#endif
+
+#endif /* HAVE_MREMAP */
+
+/* Define USE_ARENAS to enable support for multiple `arenas'.  These
+   are allocated using mmap(), are necessary for threads and
+   occasionally useful to overcome address space limitations affecting
+   sbrk(). */
+
+#ifndef USE_ARENAS
+#define USE_ARENAS 1  // we 'manually' mmap the arenas.....
+#endif
+
+
+/*
+  The system page size. To the extent possible, this malloc manages
+  memory from the system in page-size units.  Note that this value is
+  cached during initialization into a field of malloc_state. So even
+  if malloc_getpagesize is a function, it is only called once.
+
+  The following mechanics for getpagesize were adapted from bsd/gnu
+  getpagesize.h. If none of the system-probes here apply, a value of
+  4096 is used, which should be OK: If they don't apply, then using
+  the actual value probably doesn't impact performance.
+*/
+
+/*
+  The page size is fixed at 4096 here.  Because malloc_getpagesize is
+  therefore always defined, the #ifndef-guarded probing that follows
+  is never compiled.
+*/
+#define malloc_getpagesize (4096)
+#ifndef malloc_getpagesize
+
+#ifndef LACKS_UNISTD_H
+#  include <unistd.h>
+#endif
+
+#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
+#    ifndef _SC_PAGE_SIZE
+#      define _SC_PAGE_SIZE _SC_PAGESIZE
+#    endif
+#  endif
+
+#  ifdef _SC_PAGE_SIZE
+#    define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
+#  else
+#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
+       extern size_t getpagesize();
+#      define malloc_getpagesize getpagesize()
+#    else
+#      ifdef WIN32 /* use supplied emulation of getpagesize */
+#        define malloc_getpagesize getpagesize()
+#      else
+#        ifndef LACKS_SYS_PARAM_H
+#          include <sys/param.h>
+#        endif
+#        ifdef EXEC_PAGESIZE
+#          define malloc_getpagesize EXEC_PAGESIZE
+#        else
+#          ifdef NBPG
+#            ifndef CLSIZE
+#              define malloc_getpagesize NBPG
+#            else
+#              define malloc_getpagesize (NBPG * CLSIZE)
+#            endif
+#          else
+#            ifdef NBPC
+#              define malloc_getpagesize NBPC
+#            else
+#              ifdef PAGESIZE
+#                define malloc_getpagesize PAGESIZE
+#              else /* just guess */
+#                define malloc_getpagesize (4096)
+#              endif
+#            endif
+#          endif
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+
+/*
+  This version of malloc supports the standard SVID/XPG mallinfo
+  routine that returns a struct containing usage properties and
+  statistics. It should work on any SVID/XPG compliant system that has
+  a /usr/include/malloc.h defining struct mallinfo. (If you'd like to
+  install such a thing yourself, cut out the preliminary declarations
+  as described above and below and save them in a malloc.h file. But
+  there's no compelling reason to bother to do this.)
+
+  The main declaration needed is the mallinfo struct that is returned
+  (by-copy) by mallinfo().  The SVID/XPG mallinfo struct contains a
+  bunch of fields that are not even meaningful in this version of
+  malloc.  These fields are instead filled by mallinfo() with
+  other numbers that might be of interest.
+
+  HAVE_USR_INCLUDE_MALLOC_H should be set if you have a
+  /usr/include/malloc.h file that includes a declaration of struct
+  mallinfo.  If so, it is included; else an SVID2/XPG2 compliant
+  version is declared below.  These must be precisely the same for
+  mallinfo() to work.  The original SVID version of this struct,
+  defined on most systems with mallinfo, declares all fields as
+  ints. But some others define as unsigned long. If your system
+  defines the fields using a type of different width than listed here,
+  you must #include your system version and #define
+  HAVE_USR_INCLUDE_MALLOC_H.
+*/
+
+/* #define HAVE_USR_INCLUDE_MALLOC_H */
+
+#ifdef HAVE_USR_INCLUDE_MALLOC_H
+#include "/usr/include/malloc.h"
+#endif
+
+
+/* ---------- description of public routines ------------ */
+
+/*
+  malloc(size_t n)
+  Returns a pointer to a newly allocated chunk of at least n bytes, or null
+  if no space is available. Additionally, on failure, errno is
+  set to ENOMEM on ANSI C systems.
+
+  If n is zero, malloc returns a minimum-sized chunk. (The minimum
+  size is 16 bytes on most 32bit systems, and 24 or 32 bytes on 64bit
+  systems.)  On most systems, size_t is an unsigned type, so calls
+  with negative arguments are interpreted as requests for huge amounts
+  of space, which will often fail. The maximum supported value of n
+  differs across systems, but is in all cases less than the maximum
+  representable value of a size_t.
+*/
+#if __STD_C
+Void_t*  public_mALLOc(cvmx_arena_list_t arena_list, size_t);
+#else
+Void_t*  public_mALLOc();
+#endif
+
+/*
+  free(Void_t* p)
+  Releases the chunk of memory pointed to by p, that had been previously
+  allocated using malloc or a related routine such as realloc.
+  It has no effect if p is null. It can have arbitrary (i.e., bad!)
+  effects if p has already been freed.
+
+  Unless disabled (using mallopt), freeing very large spaces will,
+  when possible, automatically trigger operations that give
+  back unused memory to the system, thus reducing program footprint.
+*/
+#if __STD_C
+void     public_fREe(Void_t*);
+#else
+void     public_fREe();
+#endif
+
+/*
+  calloc(size_t n_elements, size_t element_size);
+  Returns a pointer to n_elements * element_size bytes, with all locations
+  set to zero.
+*/
+#if __STD_C
+Void_t*  public_cALLOc(cvmx_arena_list_t arena_list, size_t, size_t);
+#else
+Void_t*  public_cALLOc();
+#endif
+
+/*
+  realloc(Void_t* p, size_t n)
+  Returns a pointer to a chunk of size n that contains the same data
+  as does chunk p up to the minimum of (n, p's size) bytes, or null
+  if no space is available.
+
+  The returned pointer may or may not be the same as p. The algorithm
+  prefers extending p when possible, otherwise it employs the
+  equivalent of a malloc-copy-free sequence.
+
+  If p is null, realloc is equivalent to malloc.
+
+  If space is not available, realloc returns null, errno is set (if on
+  ANSI) and p is NOT freed.
+
+  if n is for fewer bytes than already held by p, the newly unused
+  space is lopped off and freed if possible.  Unless the #define
+  REALLOC_ZERO_BYTES_FREES is set, realloc with a size argument of
+  zero (re)allocates a minimum-sized chunk.
+
+  Large chunks that were internally obtained via mmap will always
+  be reallocated using malloc-copy-free sequences unless
+  the system supports MREMAP (currently only linux).
+
+  The old unix realloc convention of allowing the last-free'd chunk
+  to be used as an argument to realloc is not supported.
+*/
+#if __STD_C
+Void_t*  public_rEALLOc(cvmx_arena_list_t arena_list, Void_t*, size_t);
+#else
+Void_t*  public_rEALLOc();
+#endif
+
+/*
+  memalign(size_t alignment, size_t n);
+  Returns a pointer to a newly allocated chunk of n bytes, aligned
+  in accord with the alignment argument.
+
+  The alignment argument should be a power of two. If the argument is
+  not a power of two, the nearest greater power is used.
+  8-byte alignment is guaranteed by normal malloc calls, so don't
+  bother calling memalign with an argument of 8 or less.
+
+  Overreliance on memalign is a sure way to fragment space.
+*/
+#if __STD_C
+Void_t*  public_mEMALIGn(cvmx_arena_list_t arena_list, size_t, size_t);
+#else
+Void_t*  public_mEMALIGn();
+#endif
+
+/*
+  valloc(size_t n);
+  Equivalent to memalign(pagesize, n), where pagesize is the page
+  size of the system. If the pagesize is unknown, 4096 is used.
+*/
+#if __STD_C
+Void_t*  public_vALLOc(size_t);
+#else
+Void_t*  public_vALLOc();
+#endif
+
+
+
+/*
+  mallopt(int parameter_number, int parameter_value)
+  Sets tunable parameters. The format is to provide a
+  (parameter-number, parameter-value) pair.  mallopt then sets the
+  corresponding parameter to the argument value if it can (i.e., so
+  long as the value is meaningful), and returns 1 if successful else
+  0.  SVID/XPG/ANSI defines four standard param numbers for mallopt,
+  normally defined in malloc.h.  Only one of these (M_MXFAST) is used
+  in this malloc. The others (M_NLBLKS, M_GRAIN, M_KEEP) don't apply,
+  so setting them has no effect. But this malloc also supports four
+  other options in mallopt. See below for details.  Briefly, supported
+  parameters are as follows (listed defaults are for "typical"
+  configurations).
+
+  Symbol            param #   default    allowed param values
+  M_MXFAST          1         64         0-80  (0 disables fastbins)
+  M_TRIM_THRESHOLD -1         128*1024   any   (-1U disables trimming)
+  M_TOP_PAD        -2         0          any
+  M_MMAP_THRESHOLD -3         128*1024   any   (or 0 if no MMAP support)
+  M_MMAP_MAX       -4         65536      any   (0 disables use of mmap)
+*/
+#if __STD_C
+int      public_mALLOPt(int, int);
+#else
+int      public_mALLOPt();
+#endif
+
+
+/*
+  mallinfo()
+  Returns (by copy) a struct containing various summary statistics:
+
+  arena:     current total non-mmapped bytes allocated from system
+  ordblks:   the number of free chunks
+  smblks:    the number of fastbin blocks (i.e., small chunks that
+               have been freed but not yet reused or consolidated)
+  hblks:     current number of mmapped regions
+  hblkhd:    total bytes held in mmapped regions
+  usmblks:   the maximum total allocated space. This will be greater
+                than current total if trimming has occurred.
+  fsmblks:   total bytes held in fastbin blocks
+  uordblks:  current total allocated space (normal or mmapped)
+  fordblks:  total free space
+  keepcost:  the maximum number of bytes that could ideally be released
+               back to system via malloc_trim. ("ideally" means that
+               it ignores page restrictions etc.)
+
+  Because these fields are ints, but internal bookkeeping may
+  be kept as longs, the reported values may wrap around zero and
+  thus be inaccurate.
+*/
+#if __STD_C
+struct mallinfo public_mALLINFo(void);
+#else
+struct mallinfo public_mALLINFo();
+#endif
+
+/*
+  independent_calloc(size_t n_elements, size_t element_size, Void_t* chunks[]);
+
+  independent_calloc is similar to calloc, but instead of returning a
+  single cleared space, it returns an array of pointers to n_elements
+  independent elements that can hold contents of size elem_size, each
+  of which starts out cleared, and can be independently freed,
+  realloc'ed etc. The elements are guaranteed to be adjacently
+  allocated (this is not guaranteed to occur with multiple callocs or
+  mallocs), which may also improve cache locality in some
+  applications.
+
+  The "chunks" argument is optional (i.e., may be null, which is
+  probably the most typical usage). If it is null, the returned array
+  is itself dynamically allocated and should also be freed when it is
+  no longer needed. Otherwise, the chunks array must be of at least
+  n_elements in length. It is filled in with the pointers to the
+  chunks.
+
+  In either case, independent_calloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and "chunks"
+  is null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use regular calloc and assign pointers into this
+  space to represent elements.  (In this case though, you cannot
+  independently free elements.)
+
+  independent_calloc simplifies and speeds up implementations of many
+  kinds of pools.  It may also be useful when constructing large data
+  structures that initially have a fixed number of fixed-sized nodes,
+  but the number is not known at compile time, and some of the nodes
+  may later need to be freed. For example:
+
+  struct Node { int item; struct Node* next; };
+
+  struct Node* build_list() {
+    struct Node** pool;
+    int n = read_number_of_nodes_needed();
+    if (n <= 0) return 0;
+    pool = (struct Node**)independent_calloc(n, sizeof(struct Node), 0);
+    if (pool == 0) die();
+    // organize into a linked list...
+    struct Node* first = pool[0];
+    for (int i = 0; i < n-1; ++i)
+      pool[i]->next = pool[i+1];
+    free(pool);     // Can now free the array (or not, if it is needed later)
+    return first;
+  }
+*/
+#if __STD_C
+Void_t** public_iCALLOc(size_t, size_t, Void_t**);
+#else
+Void_t** public_iCALLOc();
+#endif
+
+/*
+  independent_comalloc(size_t n_elements, size_t sizes[], Void_t* chunks[]);
+
+  independent_comalloc allocates, all at once, a set of n_elements
+  chunks with sizes indicated in the "sizes" array.    It returns
+  an array of pointers to these elements, each of which can be
+  independently freed, realloc'ed etc. The elements are guaranteed to
+  be adjacently allocated (this is not guaranteed to occur with
+  multiple callocs or mallocs), which may also improve cache locality
+  in some applications.
+
+  The "chunks" argument is optional (i.e., may be null). If it is null
+  the returned array is itself dynamically allocated and should also
+  be freed when it is no longer needed. Otherwise, the chunks array
+  must be of at least n_elements in length. It is filled in with the
+  pointers to the chunks.
+
+  In either case, independent_comalloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and chunks is
+  null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use a single regular malloc, and assign pointers at
+  particular offsets in the aggregate space. (In this case though, you
+  cannot independently free elements.)
+
+  independent_comalloc differs from independent_calloc in that each
+  element may have a different size, and also that it does not
+  automatically clear elements.
+
+  independent_comalloc can be used to speed up allocation in cases
+  where several structs or objects must always be allocated at the
+  same time.  For example:
+
+  struct Head { ... };
+  struct Foot { ... };
+
+  void send_message(char* msg) {
+    int msglen = strlen(msg);
+    size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
+    void* chunks[3];
+    if (independent_comalloc(3, sizes, chunks) == 0)
+      die();
+    struct Head* head = (struct Head*)(chunks[0]);
+    char*        body = (char*)(chunks[1]);
+    struct Foot* foot = (struct Foot*)(chunks[2]);
+    // ...
+  }
+
+  In general though, independent_comalloc is worth using only for
+  larger values of n_elements. For small values, you probably won't
+  detect enough difference from series of malloc calls to bother.
+
+  Overuse of independent_comalloc can increase overall memory usage,
+  since it cannot reuse existing noncontiguous small chunks that
+  might be available for some of the elements.
+*/
+#if __STD_C
+Void_t** public_iCOMALLOc(size_t, size_t*, Void_t**);
+#else
+Void_t** public_iCOMALLOc();
+#endif
+
+
+/*
+  pvalloc(size_t n);
+  Equivalent to valloc(minimum-page-that-holds(n)), that is,
+  round up n to nearest pagesize.
+ */
+#if __STD_C
+Void_t*  public_pVALLOc(size_t);
+#else
+Void_t*  public_pVALLOc();
+#endif
+
+/*
+  cfree(Void_t* p);
+  Equivalent to free(p).
+
+  cfree is needed/defined on some systems that pair it with calloc,
+  for odd historical reasons (such as: cfree is used in example
+  code in the first edition of K&R).
+*/
+#if __STD_C
+void     public_cFREe(Void_t*);
+#else
+void     public_cFREe();
+#endif
+
+/*
+  malloc_trim(size_t pad);
+
+  If possible, gives memory back to the system (via negative
+  arguments to sbrk) if there is unused memory at the `high' end of
+  the malloc pool. You can call this after freeing large blocks of
+  memory to potentially reduce the system-level memory requirements
+  of a program. However, it cannot guarantee to reduce memory. Under
+  some allocation patterns, some large free blocks of memory will be
+  locked between two used chunks, so they cannot be given back to
+  the system.
+
+  The `pad' argument to malloc_trim represents the amount of free
+  trailing space to leave untrimmed. If this argument is zero,
+  only the minimum amount of memory to maintain internal data
+  structures will be left (one page or less). Non-zero arguments
+  can be supplied to maintain enough trailing space to service
+  future expected allocations without having to re-obtain memory
+  from the system.
+
+  Malloc_trim returns 1 if it actually released any memory, else 0.
+  On systems that do not support "negative sbrks", it will always
+  return 0.
+*/
+#if __STD_C
+int      public_mTRIm(size_t);
+#else
+int      public_mTRIm();
+#endif
+
+/*
+  malloc_usable_size(Void_t* p);
+
+  Returns the number of bytes you can actually use in
+  an allocated chunk, which may be more than you requested (although
+  often not) due to alignment and minimum size constraints.
+  You can use this many bytes without worrying about
+  overwriting other allocated objects. This is not a particularly great
+  programming practice. malloc_usable_size can be more useful in
+  debugging and assertions, for example:
+
+  p = malloc(n);
+  assert(malloc_usable_size(p) >= 256);
+
+*/
+#if __STD_C
+size_t   public_mUSABLe(Void_t*);
+#else
+size_t   public_mUSABLe();
+#endif
+
+/*
+  malloc_stats();
+  Prints on stderr the amount of space obtained from the system (both
+  via sbrk and mmap), the maximum amount (which may be more than
+  current if malloc_trim and/or munmap got called), and the current
+  number of bytes allocated via malloc (or realloc, etc) but not yet
+  freed. Note that this is the number of bytes allocated, not the
+  number requested. It will be larger than the number requested
+  because of alignment and bookkeeping overhead. Because it includes
+  alignment wastage as being in use, this figure may be greater than
+  zero even when no user-level chunks are allocated.
+
+  The reported current and maximum system memory can be inaccurate if
+  a program makes other calls to system memory allocation functions
+  (normally sbrk) outside of malloc.
+
+  malloc_stats prints only the most commonly interesting statistics.
+  More information can be obtained by calling mallinfo.
+
+*/
+#if __STD_C
+void     public_mSTATs(void);
+#else
+void     public_mSTATs();
+#endif
+
+/*
+  malloc_get_state(void);
+
+  Returns the state of all malloc variables in an opaque data
+  structure.
+*/
+#if __STD_C
+Void_t*  public_gET_STATe(void);
+#else
+Void_t*  public_gET_STATe();
+#endif
+
+/*
+  malloc_set_state(Void_t* state);
+
+  Restore the state of all malloc variables from data obtained with
+  malloc_get_state().
+*/
+#if __STD_C
+int      public_sET_STATe(Void_t*);
+#else
+int      public_sET_STATe();
+#endif
+
+#ifdef _LIBC
+/*
+  posix_memalign(void **memptr, size_t alignment, size_t size);
+
+  POSIX wrapper like memalign(), checking for validity of size.
+*/
+int      __posix_memalign(void **, size_t, size_t);
+#endif
+
+/* mallopt tuning options */
+
+/*
+  M_MXFAST is the maximum request size used for "fastbins", special bins
+  that hold returned chunks without consolidating their spaces. This
+  enables future requests for chunks of the same size to be handled
+  very quickly, but can increase fragmentation, and thus increase the
+  overall memory footprint of a program.
+
+  This malloc manages fastbins very conservatively yet still
+  efficiently, so fragmentation is rarely a problem for values less
+  than or equal to the default.  The maximum supported value of MXFAST
+  is 80. You wouldn't want it any higher than this anyway.  Fastbins
+  are designed especially for use with many small structs, objects or
+  strings -- the default handles structs/objects/arrays with sizes up
+  to 8 4byte fields, or small strings representing words, tokens,
+  etc. Using fastbins for larger objects normally worsens
+  fragmentation without improving speed.
+
+  M_MXFAST is set in REQUEST size units. It is internally used in
+  chunksize units, which adds padding and alignment.  You can reduce
+  M_MXFAST to 0 to disable all use of fastbins.  This causes the malloc
+  algorithm to be a closer approximation of fifo-best-fit in all cases,
+  not just for larger requests, but will generally cause it to be
+  slower.
+*/
+
+
+/* M_MXFAST is a standard SVID/XPG tuning option, usually listed in malloc.h */
+#ifndef M_MXFAST
+#define M_MXFAST            1
+#endif
+
+#ifndef DEFAULT_MXFAST
+#define DEFAULT_MXFAST     64
+#endif
+
+
+/*
+  M_TRIM_THRESHOLD is the maximum amount of unused top-most memory
+  to keep before releasing via malloc_trim in free().
+
+  Automatic trimming is mainly useful in long-lived programs.
+  Because trimming via sbrk can be slow on some systems, and can
+  sometimes be wasteful (in cases where programs immediately
+  afterward allocate more large chunks) the value should be high
+  enough so that your overall system performance would improve by
+  releasing this much memory.
+
+  The trim threshold and the mmap control parameters (see below)
+  can be traded off with one another. Trimming and mmapping are
+  two different ways of releasing unused memory back to the
+  system. Between these two, it is often possible to keep
+  system-level demands of a long-lived program down to a bare
+  minimum. For example, in one test suite of sessions measuring
+  the XF86 X server on Linux, using a trim threshold of 128K and a
+  mmap threshold of 192K led to near-minimal long term resource
+  consumption.
+
+  If you are using this malloc in a long-lived program, it should
+  pay to experiment with these values.  As a rough guide, you
+  might set to a value close to the average size of a process
+  (program) running on your system.  Releasing this much memory
+  would allow such a process to run in memory.  Generally, it's
+  worth it to tune for trimming rather than memory mapping when a
+  program undergoes phases where several large chunks are
+  allocated and released in ways that can reuse each other's
+  storage, perhaps mixed with phases where there are no such
+  chunks at all.  And in well-behaved long-lived programs,
+  controlling release of large blocks via trimming versus mapping
+  is usually faster.
+
+  However, in most programs, these parameters serve mainly as
+  protection against the system-level effects of carrying around
+  massive amounts of unneeded memory. Since frequent calls to
+  sbrk, mmap, and munmap otherwise degrade performance, the default
+  parameters are set to relatively high values that serve only as
+  safeguards.
+
+  The trim value must be greater than page size to have any useful
+  effect.  To disable trimming completely, you can set to
+  (unsigned long)(-1)
+
+  Trim settings interact with fastbin (MXFAST) settings: Unless
+  TRIM_FASTBINS is defined, automatic trimming never takes place upon
+  freeing a chunk with size less than or equal to MXFAST. Trimming is
+  instead delayed until subsequent freeing of larger chunks. However,
+  you can still force an attempted trim by calling malloc_trim.
+
+  Also, trimming is not generally possible in cases where
+  the main arena is obtained via mmap.
+
+  Note that the trick some people use of mallocing a huge space and
+  then freeing it at program startup, in an attempt to reserve system
+  memory, doesn't have the intended effect under automatic trimming,
+  since that memory will immediately be returned to the system.
+*/
+
+#define M_TRIM_THRESHOLD       -1
+
+#ifndef DEFAULT_TRIM_THRESHOLD
+#define DEFAULT_TRIM_THRESHOLD (128 * 1024)
+#endif
+
+/*
+  M_TOP_PAD is the amount of extra `padding' space to allocate or
+  retain whenever sbrk is called. It is used in two ways internally:
+
+  * When sbrk is called to extend the top of the arena to satisfy
+  a new malloc request, this much padding is added to the sbrk
+  request.
+
+  * When malloc_trim is called automatically from free(),
+  it is used as the `pad' argument.
+
+  In both cases, the actual amount of padding is rounded
+  so that the end of the arena is always a system page boundary.
+
+  The main reason for using padding is to avoid calling sbrk so
+  often. Having even a small pad greatly reduces the likelihood
+  that nearly every malloc request during program start-up (or
+  after trimming) will invoke sbrk, which needlessly wastes
+  time.
+
+  Automatic rounding-up to page-size units is normally sufficient
+  to avoid measurable overhead, so the default is 0.  However, in
+  systems where sbrk is relatively slow, it can pay to increase
+  this value, at the expense of carrying around more memory than
+  the program needs.
+*/
+
+#define M_TOP_PAD              -2
+
+#ifndef DEFAULT_TOP_PAD
+#define DEFAULT_TOP_PAD        (0)
+#endif
+
+/*
+  M_MMAP_THRESHOLD is the request size threshold for using mmap()
+  to service a request. Requests of at least this size that cannot
+  be allocated using already-existing space will be serviced via mmap.
+  (If enough normal freed space already exists it is used instead.)
+
+  Using mmap segregates relatively large chunks of memory so that
+  they can be individually obtained and released from the host
+  system. A request serviced through mmap is never reused by any
+  other request (at least not directly; the system may just so
+  happen to remap successive requests to the same locations).
+
+  Segregating space in this way has the benefits that:
+
+   1. Mmapped space can ALWAYS be individually released back
+      to the system, which helps keep the system level memory
+      demands of a long-lived program low.
+   2. Mapped memory can never become `locked' between
+      other chunks, as can happen with normally allocated chunks, which
+      means that even trimming via malloc_trim would not release them.
+   3. On some systems with "holes" in address spaces, mmap can obtain
+      memory that sbrk cannot.
+
+  However, it has the disadvantages that:
+
+   1. The space cannot be reclaimed, consolidated, and then
+      used to service later requests, as happens with normal chunks.
+   2. It can lead to more wastage because of mmap page alignment
+      requirements
+   3. It causes malloc performance to be more dependent on host
+      system memory management support routines which may vary in
+      implementation quality and may impose arbitrary
+      limitations. Generally, servicing a request via normal
+      malloc steps is faster than going through a system's mmap.
+
+  The advantages of mmap nearly always outweigh disadvantages for
+  "large" chunks, but the value of "large" varies across systems.  The
+  default is an empirically derived value that works well in most
+  systems.
+*/
+
+#define M_MMAP_THRESHOLD      -3
+
+#ifndef DEFAULT_MMAP_THRESHOLD
+#define DEFAULT_MMAP_THRESHOLD (128 * 1024)
+#endif
+
+/*
+  M_MMAP_MAX is the maximum number of requests to simultaneously
+  service using mmap. This parameter exists because
+  some systems have a limited number of internal tables for
+  use by mmap, and using more than a few of them may degrade
+  performance.
+
+  The default is set to a value that serves only as a safeguard.
+  Setting to 0 disables use of mmap for servicing large requests.  If
+  HAVE_MMAP is not set, the default value is 0, and attempts to set it
+  to non-zero values in mallopt will fail.
+*/
+
+#define M_MMAP_MAX             -4
+
+#ifndef DEFAULT_MMAP_MAX
+#if HAVE_MMAP
+#define DEFAULT_MMAP_MAX       (65536)
+#else
+#define DEFAULT_MMAP_MAX       (0)
+#endif
+#endif
+
+#ifdef __cplusplus
+};  /* end of extern "C" */
+#endif
+
+#include <cvmx-spinlock.h>
+#include "malloc.h"
+#include "thread-m.h"
+/*
+  debug_printf is printf when DEBUG_PRINTS is defined; otherwise it
+  expands to nothing, so the call and its arguments vanish from the
+  build.  (args... is the GNU named-variadic macro syntax.)
+*/
+#ifdef DEBUG_PRINTS
+#define debug_printf    printf
+#else
+#define debug_printf(format, args...)  
+#endif
+
+#ifndef BOUNDED_N
+#define BOUNDED_N(ptr, sz) (ptr)
+#endif
+#ifndef RETURN_ADDRESS
+#define RETURN_ADDRESS(X_) (NULL)
+#endif
+
+/* On some platforms we can compile internal, not exported functions better.
+   Let the environment provide a macro and define it to be empty if it
+   is not available.  */
+#ifndef internal_function
+# define internal_function
+#endif
+
+/* Forward declarations.  */
+struct malloc_chunk;
+typedef struct malloc_chunk* mchunkptr; /* pointer to a chunk header */
+
+/* Internal routines.
+
+   Forward declarations for allocator internals.  When __STD_C is
+   nonzero, full prototypes are given; otherwise the #else branch falls
+   back to old-style declarations without parameter lists.
+
+   NOTE(review): the two branches do not declare the same set of names.
+   The #else branch declares _int_malloc .. _int_pvalloc without
+   `static', leaves cALLOc commented out, and has no declarations for
+   mem2mem_check/top_check/munmap_chunk/mremap_chunk or for the *_check,
+   *_starter and *_atfork routines.  */
+
+#if __STD_C
+
+static Void_t*         _int_malloc(mstate, size_t);
+static void            _int_free(mstate, Void_t*);
+static Void_t*         _int_realloc(mstate, Void_t*, size_t);
+static Void_t*         _int_memalign(mstate, size_t, size_t);
+static Void_t*         _int_valloc(mstate, size_t);
+static Void_t*  _int_pvalloc(mstate, size_t);
+static Void_t*  cALLOc(cvmx_arena_list_t arena_list, size_t, size_t);
+static Void_t** _int_icalloc(mstate, size_t, size_t, Void_t**);
+static Void_t** _int_icomalloc(mstate, size_t, size_t*, Void_t**);
+static int      mTRIm(size_t);
+static size_t   mUSABLe(Void_t*);
+static void     mSTATs(void);
+static int      mALLOPt(int, int);
+static struct mallinfo mALLINFo(mstate);
+
+static Void_t* internal_function mem2mem_check(Void_t *p, size_t sz);
+static int internal_function top_check(void);
+static void internal_function munmap_chunk(mchunkptr p);
+#if HAVE_MREMAP
+static mchunkptr internal_function mremap_chunk(mchunkptr p, size_t new_size);
+#endif /* HAVE_MREMAP */
+
+static Void_t*   malloc_check(size_t sz, const Void_t *caller);
+static void      free_check(Void_t* mem, const Void_t *caller);
+static Void_t*   realloc_check(Void_t* oldmem, size_t bytes,
+			       const Void_t *caller);
+static Void_t*   memalign_check(size_t alignment, size_t bytes,
+				const Void_t *caller);
+#ifndef NO_THREADS
+static Void_t*   malloc_starter(size_t sz, const Void_t *caller);
+static void      free_starter(Void_t* mem, const Void_t *caller);
+static Void_t*   malloc_atfork(size_t sz, const Void_t *caller);
+static void      free_atfork(Void_t* mem, const Void_t *caller);
+#endif /* !NO_THREADS */
+
+#else /* !__STD_C */
+
+Void_t*         _int_malloc();
+void            _int_free();
+Void_t*         _int_realloc();
+Void_t*         _int_memalign();
+Void_t*         _int_valloc();
+Void_t*         _int_pvalloc();
+/*static Void_t*  cALLOc();*/
+static Void_t** _int_icalloc();
+static Void_t** _int_icomalloc();
+static int      mTRIm();
+static size_t   mUSABLe();
+static void     mSTATs();
+static int      mALLOPt();
+static struct mallinfo mALLINFo();
+
+#endif /* __STD_C */
+
+
+
+
+/* ------------- Optional versions of memcopy ---------------- */
+
+
+#if USE_MEMCPY
+
+/*
+  Note: memcpy is ONLY invoked with non-overlapping regions,
+  so the (usually slower) memmove is not needed.
+*/
+
+#define MALLOC_COPY(dest, src, nbytes)  memcpy(dest, src, nbytes)
+#define MALLOC_ZERO(dest, nbytes)       memset(dest, 0,   nbytes)
+
+#else /* !USE_MEMCPY */
+
+/*
+  Use Duff's device for good zeroing/copying performance.
+
+  Both macros work in units of INTERNAL_SIZE_T words:
+    - nbytes is divided by sizeof(INTERNAL_SIZE_T); any remainder bytes
+      are not touched.
+    - PRECONDITION: nbytes must cover at least one whole word.  A word
+      count of zero enters the switch at `case 0' and processes eight
+      words.
+    - The pointer arguments are reinterpreted as INTERNAL_SIZE_T*, so
+      they must be suitably aligned for that type.
+
+  Every macro argument is parenthesized before it is cast, so that
+  pointer expressions such as `p + k' are cast as a whole instead of
+  having the cast bind to `p' alone.
+*/
+
+#define MALLOC_ZERO(charp, nbytes)                                            \
+do {                                                                          \
+  INTERNAL_SIZE_T* mzp = (INTERNAL_SIZE_T*)(charp);                           \
+  unsigned long mctmp = (nbytes)/sizeof(INTERNAL_SIZE_T);                     \
+  long mcn;                                                                   \
+  if (mctmp < 8) mcn = 0; else { mcn = (mctmp-1)/8; mctmp %= 8; }             \
+  switch (mctmp) {                                                            \
+    case 0: for(;;) { *mzp++ = 0;                                             \
+    case 7:           *mzp++ = 0;                                             \
+    case 6:           *mzp++ = 0;                                             \
+    case 5:           *mzp++ = 0;                                             \
+    case 4:           *mzp++ = 0;                                             \
+    case 3:           *mzp++ = 0;                                             \
+    case 2:           *mzp++ = 0;                                             \
+    case 1:           *mzp++ = 0; if(mcn <= 0) break; mcn--; }                \
+  }                                                                           \
+} while(0)
+
+#define MALLOC_COPY(dest,src,nbytes)                                          \
+do {                                                                          \
+  INTERNAL_SIZE_T* mcsrc = (INTERNAL_SIZE_T*)(src);                           \
+  INTERNAL_SIZE_T* mcdst = (INTERNAL_SIZE_T*)(dest);                          \
+  unsigned long mctmp = (nbytes)/sizeof(INTERNAL_SIZE_T);                     \
+  long mcn;                                                                   \
+  if (mctmp < 8) mcn = 0; else { mcn = (mctmp-1)/8; mctmp %= 8; }             \
+  switch (mctmp) {                                                            \
+    case 0: for(;;) { *mcdst++ = *mcsrc++;                                    \
+    case 7:           *mcdst++ = *mcsrc++;                                    \
+    case 6:           *mcdst++ = *mcsrc++;                                    \
+    case 5:           *mcdst++ = *mcsrc++;                                    \
+    case 4:           *mcdst++ = *mcsrc++;                                    \
+    case 3:           *mcdst++ = *mcsrc++;                                    \
+    case 2:           *mcdst++ = *mcsrc++;                                    \
+    case 1:           *mcdst++ = *mcsrc++; if(mcn <= 0) break; mcn--; }       \
+  }                                                                           \
+} while(0)
+
+#endif
+
+/* ------------------ MMAP support ------------------  */
+
+
+#if HAVE_MMAP
+
+#include <fcntl.h> /* open(), O_RDWR for the /dev/zero fallback below */
+#ifndef LACKS_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
+# define MAP_ANONYMOUS MAP_ANON /* accept the alternative spelling */
+#endif
+#if !defined(MAP_FAILED)
+# define MAP_FAILED ((char*)-1)
+#endif
+
+#ifndef MAP_NORESERVE
+# ifdef MAP_AUTORESRV
+#  define MAP_NORESERVE MAP_AUTORESRV
+# else
+#  define MAP_NORESERVE 0 /* flag unavailable: OR-ing it in is a no-op */
+# endif
+#endif
+
+/*
+   Nearly all versions of mmap support MAP_ANONYMOUS,
+   so the following is unlikely to be needed, but is
+   supplied just in case.
+
+   Without MAP_ANONYMOUS, MMAP maps /dev/zero instead.  The descriptor
+   is opened the first time MMAP is evaluated and cached in
+   dev_zero_fd.  The result of open() is not checked here: if it is
+   negative, mmap() is still called with it, and the next MMAP will
+   try the open again because dev_zero_fd is still < 0.
+
+   With MAP_ANONYMOUS, MMAP simply ORs that flag into `flags' and
+   passes -1 as the descriptor.
+*/
+
+#ifndef MAP_ANONYMOUS
+
+static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
+
+#define MMAP(addr, size, prot, flags) ((dev_zero_fd < 0) ? \
+ (dev_zero_fd = open("/dev/zero", O_RDWR), \
+  mmap((addr), (size), (prot), (flags), dev_zero_fd, 0)) : \
+   mmap((addr), (size), (prot), (flags), dev_zero_fd, 0))
+
+#else
+
+#define MMAP(addr, size, prot, flags) \
+ (mmap((addr), (size), (prot), (flags)|MAP_ANONYMOUS, -1, 0))
+
+#endif
+
+
+#endif /* HAVE_MMAP */
+
+
+/*
+  -----------------------  Chunk representations -----------------------
+*/
+
+
+/*
+  This struct declaration is misleading (but accurate and necessary).
+  It declares a "view" into memory allowing access to necessary
+  fields at known offsets from a given base. See explanation below.
+*/
+struct malloc_chunk {
+
+  INTERNAL_SIZE_T      prev_size;  /* Size of previous chunk (if free).  */
+  INTERNAL_SIZE_T      size;       /* Size in bytes, including overhead;
+                                      low bits carry PREV_INUSE/IS_MMAPPED. */
+  mstate               arena_ptr;  /* ptr to arena chunk belongs to; this
+                                      extra header word is why chunk2mem()
+                                      adds sizeof(void *) to 2*SIZE_SZ. */
+
+  struct malloc_chunk* fd;         /* double links -- used only if free.
+                                      fd = forward (next chunk in list) */
+  struct malloc_chunk* bk;         /* bk = back (previous chunk in list) */
+};
+
+
+/*
+   malloc_chunk details:
+
+    (The following includes lightly edited explanations by Colin Plumb.)
+
+    Chunks of memory are maintained using a `boundary tag' method as
+    described in e.g., Knuth or Standish.  (See the paper by Paul
+    Wilson ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a
+    survey of such techniques.)  Sizes of free chunks are stored both
+    in the front of each chunk and at the end.  This makes
+    consolidating fragmented chunks into bigger chunks very fast.  The
+    size fields also hold bits representing whether chunks are free or
+    in use.
+
+    An allocated chunk looks like this:
+
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk, if free                 | |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             User data starts here...                          .
+            .                                                               .
+            .             (malloc_usable_size() bytes)                      .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of chunk                                     |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+    Where "chunk" is the front of the chunk for the purpose of most of
+    the malloc code, but "mem" is the pointer that is returned to the
+    user.  "Nextchunk" is the beginning of the next contiguous chunk.
+
+    Chunks always begin on even word boundaries, so the mem portion
+    (which is returned to the user) is also on an even word boundary, and
+    thus at least double-word aligned.
+
+    Free chunks are stored in circular doubly-linked lists, and look like this:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Size of previous chunk                            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Forward pointer to next chunk in list             |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Back pointer to previous chunk in list            |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+            |             Unused space (may be 0 bytes long)                .
+            .                                                               .
+            .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+    The P (PREV_INUSE) bit, stored in the unused low-order bit of the
+    chunk size (which is always a multiple of two words), is an in-use
+    bit for the *previous* chunk.  If that bit is *clear*, then the
+    word before the current chunk size contains the previous chunk
+    size, and can be used to find the front of the previous chunk.
+    The very first chunk allocated always has this bit set,
+    preventing access to non-existent (or non-owned) memory. If
+    prev_inuse is set for any given chunk, then you CANNOT determine
+    the size of the previous chunk, and might even get a memory
+    addressing fault when trying to do so.
+
+    Note that the `foot' of the current chunk is actually represented
+    as the prev_size of the NEXT chunk. This makes it easier to
+    deal with alignments etc but can be very confusing when trying
+    to extend or adapt this code.
+
+    The two exceptions to all this are
+
+     1. The special chunk `top' doesn't bother using the
+        trailing size field since there is no next contiguous chunk
+        that would have to index off it. After initialization, `top'
+        is forced to always exist.  If it would become less than
+        MINSIZE bytes long, it is replenished.
+
+     2. Chunks allocated via mmap, which have the second-lowest-order
+        bit (IS_MMAPPED) set in their size fields.  Because they are
+        allocated one-by-one, each must contain its own trailing size field.
+
+*/
+
+/*
+  ---------- Size and alignment checks and conversions ----------
+*/
+
+/* conversion from malloc headers to user pointers, and back.
+   User memory begins after prev_size and size (2*SIZE_SZ, assuming
+   SIZE_SZ is the size of each of those fields) plus the arena_ptr
+   field (sizeof(void *)); mem2chunk undoes exactly that offset.
+   NOTE(review): this presumes struct malloc_chunk has no padding ahead
+   of `fd' -- confirm for every target ABI. */
+/* Added size for pointer to make room for arena_ptr */
+#define chunk2mem(p)   ((Void_t*)((char*)(p) + 2*SIZE_SZ + sizeof(void *)))
+#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ - sizeof(void *)))
+
+/* The smallest possible chunk: one complete header, fd/bk included */
+#define MIN_CHUNK_SIZE        (sizeof(struct malloc_chunk))
+
+/* The smallest size we can malloc is an aligned minimal chunk
+   (MIN_CHUNK_SIZE rounded up to a multiple of MALLOC_ALIGN_MASK+1,
+   yielded as an unsigned long) */
+
+#define MINSIZE  \
+  (unsigned long)(((MIN_CHUNK_SIZE+MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK))
+
+/* Check if m has acceptable alignment (no MALLOC_ALIGN_MASK bits set) */
+
+#define aligned_OK(m)  (((unsigned long)((m)) & (MALLOC_ALIGN_MASK)) == 0)
+
+
+/*
+   Check if a request is so large that it would wrap around zero when
+   padded and aligned. To simplify some other code, the bound is made
+   low enough so that adding MINSIZE will also not wrap around zero.
+
+   The comparison is done in unsigned long against (-2 * MINSIZE)
+   converted through INTERNAL_SIZE_T, i.e. a value 2*MINSIZE below the
+   wrap-around point of that type.
+*/
+
+#define REQUEST_OUT_OF_RANGE(req)                                 \
+  ((unsigned long)(req) >=                                        \
+   (unsigned long)(INTERNAL_SIZE_T)(-2 * MINSIZE))
+
+/* pad request bytes into a usable size -- internal version.
+   Adds the per-chunk overhead (SIZE_SZ + sizeof(void *)), rounds up
+   with MALLOC_ALIGN_MASK, and never yields less than MINSIZE.
+   `req' is evaluated more than once, so it must have no side effects. */
+
+
+/* prev_size field of next chunk is overwritten with data
+** when in use.  NOTE - last SIZE_SZ of arena must be left
+** unused for last chunk to use
+*/
+/* Added sizeof(void *) to make room for arena_ptr */
+#define request2size(req)                                         \
+  (((req) + sizeof(void *) + SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE)  ?             \
+   MINSIZE :                                                      \
+   ((req) + sizeof(void *) + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)
+
+/*  Same, except also perform argument check.
+
+    CAUTION: this is a statement macro, not an expression.  It expands
+    to an `if' statement followed by an assignment that already ends in
+    `;'.  On an out-of-range request it runs MALLOC_FAILURE_ACTION and
+    executes `return 0' in the *calling* function, so it can only be
+    used inside functions whose return type accepts 0.  Because it is
+    two statements, do not use it as the unbraced body of an
+    if/else/loop. */
+
+#define checked_request2size(req, sz)                             \
+  if (REQUEST_OUT_OF_RANGE(req)) {                                \
+    MALLOC_FAILURE_ACTION;                                        \
+    return 0;                                                     \
+  }                                                               \
+  (sz) = request2size(req);
+
+/*
+  --------------- Physical chunk operations ---------------
+*/
+
+
+/* PREV_INUSE: set in a chunk's size field while the chunk physically
+   in front of it is allocated */
+#define PREV_INUSE 0x1
+
+/* nonzero iff the chunk before p is in use */
+#define prev_inuse(p)       ((p)->size & PREV_INUSE)
+
+
+/* IS_MMAPPED: set in a chunk's size field when it came from mmap() */
+#define IS_MMAPPED 0x2
+
+/* nonzero iff p was obtained with mmap() */
+#define chunk_is_mmapped(p) ((p)->size & IS_MMAPPED)
+
+
+
+/*
+  Flag bits that share the size word and must be stripped to get the
+  real size.
+
+  Note: IS_MMAPPED is intentionally not masked off from size field in
+  macros for which mmapped chunks should never be seen. This should
+  cause helpful core dumps to occur if it is tried by accident by
+  people extending or adapting this malloc.
+*/
+#define SIZE_BITS (PREV_INUSE|IS_MMAPPED)
+
+/* Size of p in bytes with the flag bits removed */
+#define chunksize(p)         ((p)->size & ~(SIZE_BITS))
+
+
+/* View the memory `s' bytes past p as a chunk */
+#define chunk_at_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
+
+/* The chunk that physically follows p */
+#define next_chunk(p) (chunk_at_offset(p, chunksize(p)))
+
+/* The chunk that physically precedes p (valid only if it is free) */
+#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_size) ))
+
+/* p's own inuse bit lives in the PREV_INUSE flag of the next chunk */
+#define inuse(p)\
+((next_chunk(p)->size) & PREV_INUSE)
+
+/* mark / unmark p as in use; nothing else in either header changes */
+#define set_inuse(p)\
+next_chunk(p)->size |= PREV_INUSE
+
+#define clear_inuse(p)\
+next_chunk(p)->size &= ~(PREV_INUSE)
+
+
+/* same three operations when the offset to the next chunk is known */
+#define inuse_bit_at_offset(p, s)\
+ (chunk_at_offset(p, s)->size & PREV_INUSE)
+
+#define set_inuse_bit_at_offset(p, s)\
+ (chunk_at_offset(p, s)->size |= PREV_INUSE)
+
+#define clear_inuse_bit_at_offset(p, s)\
+ (chunk_at_offset(p, s)->size &= ~(PREV_INUSE))
+
+
+/* Store a new size in the head, keeping the existing flag bits */
+#define set_head_size(p, s)  ((p)->size = (((p)->size & SIZE_BITS) | (s)))
+
+/* Overwrite the whole size/flags word */
+#define set_head(p, s)       ((p)->size = (s))
+
+/* Write the footer, i.e. prev_size of the chunk `s' bytes after p
+   (only when chunk is not in use) */
+#define set_foot(p, s)       (chunk_at_offset(p, s)->prev_size = (s))
+
+
+/*
+  -------------------- Internal data structures --------------------
+
+   All internal state is held in an instance of malloc_state defined
+   below. There are no other static variables, except in two optional
+   cases:
+   * If USE_MALLOC_LOCK is defined, the mALLOC_MUTEx declared above.
+   * If HAVE_MMAP is true, but mmap doesn't support
+     MAP_ANONYMOUS, a dummy file descriptor for mmap.
+
+   Beware of lots of tricks that minimize the total bookkeeping space
+   requirements. The result is a little over 1K bytes (for 4byte
+   pointers and size_t.)
+*/
+
+/*
+  Bins
+
+    An array of bin headers for free chunks. Each bin is doubly
+    linked.  The bins are approximately proportionally (log) spaced.
+    There are a lot of these bins (128). This may look excessive, but
+    works very well in practice.  Most bins hold sizes that are
+    unusual as malloc request sizes, but are more usual for fragments
+    and consolidated sets of chunks, which is what these bins hold, so
+    they can be found quickly.  All procedures maintain the invariant
+    that no consolidated chunk physically borders another one, so each
+    chunk in a list is known to be preceded and followed by either
+    inuse chunks or the ends of memory.
+
+    Chunks in bins are kept in size order, with ties going to the
+    approximately least recently used chunk. Ordering isn't needed
+    for the small bins, which all contain the same-sized chunks, but
+    facilitates best-fit allocation for larger chunks. These lists
+    are just sequential. Keeping them in order almost never requires
+    enough traversal to warrant using fancier ordered data
+    structures.
+
+    Chunks of the same size are linked with the most
+    recently freed at the front, and allocations are taken from the
+    back.  This results in LRU (FIFO) allocation order, which tends
+    to give each chunk an equal opportunity to be consolidated with
+    adjacent freed chunks, resulting in larger free chunks and less
+    fragmentation.
+
+    To simplify use in double-linked lists, each bin header acts
+    as a malloc_chunk. This avoids special-casing for headers.
+    But to conserve space and improve locality, we allocate
+    only the fd/bk pointers of bins, and then use repositioning tricks
+    to treat these as the fields of a malloc_chunk*.
+*/
+
+typedef struct malloc_chunk* mbinptr;
+
+/*
+   addressing -- note that bin_at(0) does not exist
+
+   bin_at(m, i) yields a pseudo chunk header positioned so that its
+   fd and bk members alias bins[2*i] and bins[2*i+1].
+
+   In this port struct malloc_chunk carries an arena_ptr member between
+   `size' and `fd', so `fd' lies 2*SIZE_SZ + sizeof(void *) bytes into a
+   chunk -- the same offset chunk2mem()/mem2chunk() use.  Backing up by
+   only 2*SIZE_SZ (as the unmodified ptmalloc2 macro does) shifts every
+   header's fd/bk one slot later, which makes bin NBINS-1's bk fall one
+   element past the end of bins[].  Going through mem2chunk() keeps the
+   two offsets in lock-step.
+*/
+#define bin_at(m, i) ((mbinptr)mem2chunk(&((m)->bins[(i)<<1])))
+
+/* analog of ++bin: consecutive bin headers are two pointers apart */
+#define next_bin(b)  ((mbinptr)((char*)(b) + (sizeof(mchunkptr)<<1)))
+
+/* Reminders about list directionality within bins */
+#define first(b)     ((b)->fd)
+#define last(b)      ((b)->bk)
+
+/* Take a chunk off a bin list.
+
+   P  : chunk to remove (evaluated twice -- pass a side-effect-free
+        expression).
+   BK : lvalue that receives P->bk.
+   FD : lvalue that receives P->fd.
+
+   The neighbours are then linked to each other; P's own fd/bk are left
+   untouched.  Every parameter is parenthesized so that arguments such
+   as `*pp' or `cond ? a : b' bind correctly.  The expansion is kept as
+   a plain brace block (not do/while) so existing call sites compile
+   unchanged. */
+#define unlink(P, BK, FD) {                                            \
+  (FD) = (P)->fd;                                                      \
+  (BK) = (P)->bk;                                                      \
+  (FD)->bk = (BK);                                                     \
+  (BK)->fd = (FD);                                                     \
+}
+
+/*
+  Indexing
+
+    Bins for sizes < 512 bytes contain chunks of all the same size, spaced
+    8 bytes apart. Larger bins are approximately logarithmically spaced:
+
+    64 bins of size       8
+    32 bins of size      64
+    16 bins of size     512
+     8 bins of size    4096
+     4 bins of size   32768
+     2 bins of size  262144
+     1 bin  of size what's left
+
+    There is actually a little bit of slop in the numbers in bin_index
+    for the sake of speed. This makes no difference elsewhere.
+
+    The bins top out around 1MB because we expect to service large
+    requests via mmap.
+
+    Notes on the macros below:
+      - smallbin_index hard-codes the shift by 3 (log2 of
+        SMALLBIN_WIDTH); the two must be changed together.
+      - largebin_index tests successively coarser shifts of sz (6, 9,
+        12, 15, 18 bits) and falls through to bin 126 for anything
+        larger.
+      - all three index macros evaluate `sz' more than once, so the
+        argument must be free of side effects.
+*/
+
+#define NBINS             128
+#define NSMALLBINS         64
+#define SMALLBIN_WIDTH      8
+#define MIN_LARGE_SIZE    512
+
+#define in_smallbin_range(sz)  \
+  ((unsigned long)(sz) < (unsigned long)MIN_LARGE_SIZE)
+
+#define smallbin_index(sz)     (((unsigned)(sz)) >> 3)
+
+#define largebin_index(sz)                                                   \
+(((((unsigned long)(sz)) >>  6) <= 32)?  56 + (((unsigned long)(sz)) >>  6): \
+ ((((unsigned long)(sz)) >>  9) <= 20)?  91 + (((unsigned long)(sz)) >>  9): \
+ ((((unsigned long)(sz)) >> 12) <= 10)? 110 + (((unsigned long)(sz)) >> 12): \
+ ((((unsigned long)(sz)) >> 15) <=  4)? 119 + (((unsigned long)(sz)) >> 15): \
+ ((((unsigned long)(sz)) >> 18) <=  2)? 124 + (((unsigned long)(sz)) >> 18): \
+                                        126)
+
+#define bin_index(sz) \
+ ((in_smallbin_range(sz)) ? smallbin_index(sz) : largebin_index(sz))
+
+/*
+  FIRST_SORTED_BIN_SIZE is the chunk size corresponding to the
+  first bin that is maintained in sorted order. This must
+  be the smallest size corresponding to a given bin.
+
+  Normally, this should be MIN_LARGE_SIZE. But you can weaken
+  best fit guarantees to sometimes speed up malloc by increasing value.
+  Doing this means that malloc may choose a chunk that is 
+  non-best-fitting by up to the width of the bin.
+
+  Some useful cutoff values:
+      512 - all bins sorted
+     2560 - leaves bins <=     64 bytes wide unsorted  
+    12288 - leaves bins <=    512 bytes wide unsorted
+    65536 - leaves bins <=   4096 bytes wide unsorted
+   262144 - leaves bins <=  32768 bytes wide unsorted
+       -1 - no bins sorted (not recommended!)
+
+  The active definition below selects MIN_LARGE_SIZE (all large bins
+  sorted); the commented-out line shows the 65536 alternative.
+*/
+
+#define FIRST_SORTED_BIN_SIZE MIN_LARGE_SIZE 
+/* #define FIRST_SORTED_BIN_SIZE 65536 */
+
+/*
+  Unsorted chunks
+
+    All remainders from chunk splits, as well as all returned chunks,
+    are first placed in the "unsorted" bin. They are then placed
+    in regular bins after malloc gives them ONE chance to be used before
+    binning. So, basically, the unsorted_chunks list acts as a queue,
+    with chunks being placed on it in free (and malloc_consolidate),
+    and taken off (to be either used or placed in bins) in malloc.
+
+    The NON_MAIN_ARENA flag is never set for unsorted chunks, so it
+    does not have to be taken into account in size comparisons.
+*/
+
+/* The otherwise unindexable 1-bin is used to hold unsorted chunks.
+   unsorted_chunks(M) is simply the bin header bin_at(M, 1). */
+#define unsorted_chunks(M)          (bin_at(M, 1))
+
+/*
+  Top
+
+    The top-most available chunk (i.e., the one bordering the end of
+    available memory) is treated specially. It is never included in
+    any bin, is used only if no other chunk is available, and is
+    released back to the system if it is very large (see
+    M_TRIM_THRESHOLD).  Because top initially
+    points to its own bin with initial zero size, thus forcing
+    extension on the first malloc request, we avoid having any special
+    code in malloc to check whether it even exists yet. But we still
+    need to do so when getting memory from system, so we make
+    initial_top treat the bin as a legal but unusable chunk during the
+    interval between initialization and the first call to
+    sYSMALLOc. (This is somewhat delicate, since it relies on
+    the 2 preceding words to be zero during this interval as well.)
+*/
+
+/* Conveniently, the unsorted bin can be used as dummy top on first call:
+   initial_top(M) is the same header as unsorted_chunks(M), and it is
+   the value malloc_init_state() stores in av->top. */
+#define initial_top(M)              (unsorted_chunks(M))
+
+/*
+  Binmap
+
+    To help compensate for the large number of bins, a one-level index
+    structure is used for bin-by-bin searching.  `binmap' is a
+    bitvector recording whether bins are definitely empty so they can
+    be skipped over during traversals.  The bits are NOT always
+    cleared as soon as bins are empty, but instead only
+    when they are noticed to be empty during traversal in malloc.
+*/
+
+/* Each binmap word tracks 32 bins, regardless of the native word size */
+#define BINMAPSHIFT      5
+#define BITSPERMAP       (1U << BINMAPSHIFT)
+#define BINMAPSIZE       (NBINS / BITSPERMAP)
+
+/* bin index -> which map word, and which bit inside that word */
+#define idx2block(i)     ((i) >> BINMAPSHIFT)
+#define idx2bit(i)       ((1U << ((i) & (BITSPERMAP-1))))
+
+/* set / clear / test the "possibly non-empty" bit for bin i of arena m */
+#define mark_bin(m,i)    ((m)->binmap[idx2block(i)] |=  idx2bit(i))
+#define unmark_bin(m,i)  ((m)->binmap[idx2block(i)] &= ~(idx2bit(i)))
+#define get_binmap(m,i)  ((m)->binmap[idx2block(i)] &   idx2bit(i))
+
+/*
+  Fastbins
+
+    An array of lists holding recently freed small chunks.  Fastbins
+    are not doubly linked.  It is faster to single-link them, and
+    since chunks are never removed from the middles of these lists,
+    double linking is not necessary. Also, unlike regular bins, they
+    are not even processed in FIFO order (they use faster LIFO) since
+    ordering doesn't much matter in the transient contexts in which
+    fastbins are normally used.
+
+    Chunks in fastbins keep their inuse bit set, so they cannot
+    be consolidated with other free chunks. malloc_consolidate
+    releases all chunks in fastbins and consolidates them with
+    other free chunks.
+*/
+
+typedef struct malloc_chunk* mfastbinptr;
+
+/* offset 2 to use otherwise unindexable first 2 bins:
+   index = (sz >> 3) - 2, so chunk sizes 16 and 24 map to slots 0 and 1.
+   The result is an int and goes negative for sz < 16. */
+#define fastbin_index(sz)        ((int)((((unsigned int)(sz)) >> 3) - 2))
+
+/* The maximum fastbin request size we support (a request size in
+   bytes; it is passed through request2size() where a chunk size is
+   needed) */
+#define MAX_FAST_SIZE     80
+
+#define NFASTBINS  (fastbin_index(request2size(MAX_FAST_SIZE))+1)
+
+/*
+  FASTBIN_CONSOLIDATION_THRESHOLD is the size of a chunk in free()
+  that triggers automatic consolidation of possibly-surrounding
+  fastbin chunks. This is a heuristic, so the exact value should not
+  matter too much. It is defined at half the default trim threshold as a
+  compromise heuristic to only attempt consolidation if it is likely
+  to lead to trimming. However, it is not dynamically tunable, since
+  consolidation reduces fragmentation surrounding large chunks even
+  if trimming is not used.
+*/
+
+#define FASTBIN_CONSOLIDATION_THRESHOLD  (65536UL)
+
+/*
+  Since the lowest 2 bits in max_fast don't matter in size comparisons,
+  they are used as flags.
+*/
+
+/*
+  FASTCHUNKS_BIT held in max_fast indicates that there are probably
+  some fastbin chunks. It is set true on entering a chunk into any
+  fastbin, and cleared only in malloc_consolidate.
+
+  The truth value is inverted so that have_fastchunks will be true
+  upon startup (since statics are zero-filled), simplifying
+  initialization checks.
+
+  Because of the inversion, set_fastchunks() CLEARS the bit and
+  clear_fastchunks() SETS it.
+*/
+
+#define FASTCHUNKS_BIT        (1U)
+
+#define have_fastchunks(M)     (((M)->max_fast &  FASTCHUNKS_BIT) == 0)
+#define clear_fastchunks(M)    ((M)->max_fast |=  FASTCHUNKS_BIT)
+#define set_fastchunks(M)      ((M)->max_fast &= ~FASTCHUNKS_BIT)
+
+/*
+  NONCONTIGUOUS_BIT indicates that MORECORE does not return contiguous
+  regions.  Otherwise, contiguity is exploited in merging together,
+  when possible, results from consecutive MORECORE calls.
+
+  The initial value comes from MORECORE_CONTIGUOUS, but is
+  changed dynamically if mmap is ever used as an sbrk substitute.
+
+  NOTE(review): in this file malloc_init_state() calls
+  set_noncontiguous() unconditionally, so every arena it initializes
+  starts out noncontiguous regardless of MORECORE_CONTIGUOUS.
+*/
+
+#define NONCONTIGUOUS_BIT     (2U)
+
+#define contiguous(M)          (((M)->max_fast &  NONCONTIGUOUS_BIT) == 0)
+#define noncontiguous(M)       (((M)->max_fast &  NONCONTIGUOUS_BIT) != 0)
+#define set_noncontiguous(M)   ((M)->max_fast |=  NONCONTIGUOUS_BIT)
+#define set_contiguous(M)      ((M)->max_fast &= ~NONCONTIGUOUS_BIT)
+
+/*
+   Set value of max_fast.
+   Use impossibly small value if 0.
+   Precondition: there are no existing fastbin chunks.
+   Setting the value clears fastchunk bit but preserves noncontiguous bit.
+
+   M : arena whose max_fast is updated (evaluated twice).
+   s : new limit as a request size; 0 selects SMALLBIN_WIDTH, any other
+       value is converted with request2size().
+
+   "Clears fastchunk" here means FASTCHUNKS_BIT is OR-ed in, because the
+   flag is stored inverted.  The whole expansion is wrapped in
+   parentheses so the macro behaves as a single expression wherever it
+   is used.
+*/
+
+#define set_max_fast(M, s) \
+  ((M)->max_fast = (((s) == 0)? SMALLBIN_WIDTH: request2size(s)) | \
+  FASTCHUNKS_BIT | \
+  ((M)->max_fast &  NONCONTIGUOUS_BIT))
+
+
+/*
+   ----------- Internal state representation and initialization -----------
+*/
+
+struct malloc_state {
+  /* Serialize access.  */
+  mutex_t mutex;
+
+  /* Statistics for locking.  Only used if THREAD_STATS is defined.  */
+  long stat_lock_direct, stat_lock_loop, stat_lock_wait;
+  long pad0_[1]; /* try to give the mutex its own cacheline */
+
+  /* The maximum chunk size to be eligible for fastbin.
+     Also carries FASTCHUNKS_BIT and NONCONTIGUOUS_BIT. */
+  INTERNAL_SIZE_T  max_fast;   /* low 2 bits used as flags */
+
+  /* Fastbins: one list head per fastbin_index() slot */
+  mfastbinptr      fastbins[NFASTBINS];
+
+  /* Base of the topmost chunk -- not otherwise kept in a bin.
+     Set to initial_top() by malloc_init_state(). */
+  mchunkptr        top;
+
+  /* The remainder from the most recent split of a small request */
+  mchunkptr        last_remainder;
+
+  /* Normal bins packed as described above: two pointers (fd, bk) per
+     bin, reached only through bin_at(). */
+  mchunkptr        bins[NBINS * 2];
+
+  /* Bitmap of bins (see mark_bin/unmark_bin/get_binmap) */
+  unsigned int     binmap[BINMAPSIZE];
+
+  /* Linked list (presumably of arenas; the users are outside this
+     section of the file) */
+  struct malloc_state *next;
+
+  /* Memory allocated from the system in this arena.  */
+  INTERNAL_SIZE_T system_mem;
+  INTERNAL_SIZE_T max_system_mem;
+};
+
+struct malloc_par {
+  /* Tunable parameters */
+  unsigned long    trim_threshold;
+  INTERNAL_SIZE_T  top_pad;
+  INTERNAL_SIZE_T  mmap_threshold;
+
+  /* Memory map support */
+  int              n_mmaps;
+  int              n_mmaps_max;
+  int              max_n_mmaps;
+
+  /* Cache malloc_getpagesize (read as mp_.pagesize by do_check_chunk) */
+  unsigned int     pagesize;
+
+  /* Statistics.  The two sbrk counters are disabled in this port and
+     kept only as commented-out members. */
+  INTERNAL_SIZE_T  mmapped_mem;
+  /*INTERNAL_SIZE_T  sbrked_mem;*/
+  /*INTERNAL_SIZE_T  max_sbrked_mem;*/
+  INTERNAL_SIZE_T  max_mmapped_mem;
+  INTERNAL_SIZE_T  max_total_mem; /* only kept for NO_THREADS */
+
+  /* First address handed out by MORECORE/sbrk.  */
+  char*            sbrk_base;
+};
+
+/* There are several instances of this struct ("arenas") in this
+   malloc.  If you are adapting this malloc in a way that does NOT use
+   a static or mmapped malloc_state, you MUST explicitly zero-fill it
+   before using. This malloc relies on the property that malloc_state
+   is initialized to all zeroes (as is true of C statics).  */
+
+
+
+/*
+  malloc_init_state -- bring a malloc_state into its initial, empty
+  configuration.
+
+  av must refer to a zero-filled malloc_state.  On return:
+    - every bin header 1 .. NBINS-1 is an empty circular list
+      (fd and bk both point at the header itself);
+    - the arena is flagged noncontiguous;
+    - max_fast is derived from DEFAULT_MXFAST;
+    - top is the dummy chunk initial_top(av).
+
+  The original ptmalloc2 note says this routine is meant to be reached
+  only through malloc_consolidate, because some optimizing compilers
+  otherwise inline it at every call point to no benefit; the call sites
+  are not in this part of the file.
+*/
+
+#if __STD_C
+static void malloc_init_state(mstate av)
+#else
+static void malloc_init_state(av) mstate av;
+#endif
+{
+  int idx = NBINS;
+
+  /* Walk the bins from the last one down to bin 1 (bin 0 does not
+     exist) and make each header its own predecessor and successor. */
+  while (--idx >= 1) {
+    mbinptr hdr = bin_at(av, idx);
+    hdr->bk = hdr;
+    hdr->fd = hdr;
+  }
+
+  /* Must precede set_max_fast(), which carries the flag over. */
+  set_noncontiguous(av);
+
+  set_max_fast(av, DEFAULT_MXFAST);
+
+  av->top = initial_top(av);
+}
+
+/*
+   Other internal utilities operating on mstates
+
+   Forward declarations only; the definitions come later in the file.
+   iALLOc is commented out in the __STD_C branch but still declared in
+   the old-style branch.
+*/
+
+#if __STD_C
+static Void_t*  sYSMALLOc(INTERNAL_SIZE_T, mstate);
+static void     malloc_consolidate(mstate);
+//static Void_t** iALLOc(mstate, size_t, size_t*, int, Void_t**);
+#else
+static Void_t*  sYSMALLOc();
+static void     malloc_consolidate();
+static Void_t** iALLOc();
+#endif
+
+/* ------------------- Support for multiple arenas --------------------
+   arena.c is textually included at this point, so it is compiled as
+   part of this translation unit and sees everything declared above. */
+#include "arena.c"
+
+/*
+  Debugging support
+
+  These routines make a number of assertions about the states
+  of data structures that should be true at all times. If any
+  are not true, it's very likely that a user program has somehow
+  trashed memory. (It's also possible that there is a coding error
+  in malloc. In which case, please report it!)
+
+  When MALLOC_DEBUG is zero, every check_* macro expands to nothing, so
+  its arguments are never evaluated.  Otherwise each one forwards to
+  the matching do_check_* function defined below.
+*/
+
+#if ! MALLOC_DEBUG
+
+#define check_chunk(A,P)
+#define check_free_chunk(A,P)
+#define check_inuse_chunk(A,P)
+#define check_remalloced_chunk(A,P,N)
+#define check_malloced_chunk(A,P,N)
+#define check_malloc_state(A)
+
+#else /* MALLOC_DEBUG */
+
+#define check_chunk(A,P)              do_check_chunk(A,P)
+#define check_free_chunk(A,P)         do_check_free_chunk(A,P)
+#define check_inuse_chunk(A,P)        do_check_inuse_chunk(A,P)
+#define check_remalloced_chunk(A,P,N) do_check_remalloced_chunk(A,P,N)
+#define check_malloced_chunk(A,P,N)   do_check_malloced_chunk(A,P,N)
+#define check_malloc_state(A)         do_check_malloc_state(A)
+
+/*
+  Properties of all chunks
+
+  Asserts the invariants that hold for any chunk p of arena av, free
+  or in use:
+    - mmapped chunk: (only when HAVE_MMAP) lies outside the arena's
+      address range if the arena is contiguous and already has a real
+      top, is page-aligned, and has aligned user memory; without
+      HAVE_MMAP an mmapped chunk is itself an assertion failure.
+    - the top chunk: at least MINSIZE bytes, predecessor marked in use.
+    - any other chunk: lies within [min_address, top) when the arena is
+      contiguous.
+*/
+
+#if __STD_C
+static void do_check_chunk(mstate av, mchunkptr p)
+#else
+static void do_check_chunk(av, p) mstate av; mchunkptr p;
+#endif
+{
+  unsigned long sz = chunksize(p);
+  /* address range the arena would span if it were one contiguous run
+     ending at the end of top */
+  char* max_address = (char*)(av->top) + chunksize(av->top);
+  char* min_address = max_address - av->system_mem;
+
+  if (chunk_is_mmapped(p)) {
+#if HAVE_MMAP
+    /* address is outside main heap  */
+    if (contiguous(av) && av->top != initial_top(av)) {
+      assert(((char*)p) < min_address || ((char*)p) > max_address);
+    }
+    /* chunk is page-aligned */
+    assert(((p->prev_size + sz) & (mp_.pagesize-1)) == 0);
+    /* mem is aligned */
+    assert(aligned_OK(chunk2mem(p)));
+#else
+    /* force an appropriate assert violation if debug set */
+    assert(!chunk_is_mmapped(p));
+#endif
+    return;
+  }
+
+  if (p == av->top) {
+    /* top size is always at least MINSIZE */
+    assert((unsigned long)(sz) >= MINSIZE);
+    /* top predecessor always marked inuse */
+    assert(prev_inuse(p));
+    return;
+  }
+
+  /* An ordinary chunk: has legal address ... */
+  if (contiguous(av)) {
+    assert(((char*)p) >= min_address);
+    assert(((char*)p + sz) <= ((char*)(av->top)));
+  }
+}
+
+/*
+  Properties of free chunks
+
+  Runs the generic chunk checks on p, then asserts that p is marked
+  free and is not mmapped.  A chunk of at least MINSIZE bytes must also
+  be aligned, carry a matching footer, be fully consolidated (both
+  physical neighbours in use, or the next one is top) and sit in a
+  consistent doubly-linked list.  Anything smaller is a special marker
+  and must be exactly SIZE_SZ bytes.
+*/
+
+#if __STD_C
+static void do_check_free_chunk(mstate av, mchunkptr p)
+#else
+static void do_check_free_chunk(av, p) mstate av; mchunkptr p;
+#endif
+{
+  /* size with only the PREV_INUSE flag stripped */
+  INTERNAL_SIZE_T sz = p->size & ~(PREV_INUSE);
+  mchunkptr next = chunk_at_offset(p, sz);
+
+  do_check_chunk(av, p);
+
+  /* Chunk must claim to be free ... */
+  assert(!inuse(p));
+  assert (!chunk_is_mmapped(p));
+
+  if ((unsigned long)(sz) < MINSIZE) {
+    /* markers are always of size SIZE_SZ */
+    assert(sz == SIZE_SZ);
+    return;
+  }
+
+  /* A regular free chunk must have OK fields */
+  assert((sz & MALLOC_ALIGN_MASK) == 0);
+  assert(aligned_OK(chunk2mem(p)));
+  /* ... matching footer field */
+  assert(next->prev_size == sz);
+  /* ... and is fully consolidated */
+  assert(prev_inuse(p));
+  assert (next == av->top || inuse(next));
+
+  /* ... and has minimally sane links */
+  assert(p->fd->bk == p);
+  assert(p->bk->fd == p);
+}
+
+/*
+  Properties of inuse chunks
+
+  Verifies that `p' passes the generic checks, belongs to arena `av',
+  claims to be in use, and is surrounded by sane neighbours.  Free
+  neighbours get the full free-chunk check; a neighbouring top chunk
+  must be at least MINSIZE with PREV_INUSE set.  Chunks flagged as
+  mmapped have no neighbours and are only given the first two checks.
+*/
+
+#if __STD_C
+static void do_check_inuse_chunk(mstate av, mchunkptr p)
+#else
+static void do_check_inuse_chunk(av, p) mstate av; mchunkptr p;
+#endif
+{
+  mchunkptr next;
+
+  do_check_chunk(av, p);
+  assert(av == arena_for_chunk(p));
+
+  /* mmapped chunks have no next/prev, so nothing more can be checked */
+  if (!chunk_is_mmapped(p)) {
+
+    /* The chunk must claim to be in use ... */
+    assert(inuse(p));
+
+    next = next_chunk(p);
+
+    /*
+      ... and border OK chunks.  Free chunks allow more checks than
+      inuse ones, so a free neighbour is checked thoroughly.  The
+      previous chunk may only be looked at when it is not inuse.
+    */
+    if (!prev_inuse(p))  {
+      mchunkptr prv = prev_chunk(p);
+      assert(next_chunk(prv) == p);
+      do_check_free_chunk(av, prv);
+    }
+
+    if (next != av->top) {
+      if (!inuse(next))
+        do_check_free_chunk(av, next);
+    }
+    else {
+      assert(prev_inuse(next));
+      assert(chunksize(next) >= MINSIZE);
+    }
+  }
+}
+
+/*
+  Properties of chunks recycled from fastbins
+
+  av: arena the chunk is being handed out from
+  p:  chunk about to be returned to the caller
+  s:  padded request size the chunk is meant to satisfy
+
+  Besides the inuse-chunk checks, the chunk size must be aligned, at
+  least MINSIZE, no smaller than s and less than MINSIZE larger than s.
+*/
+
+#if __STD_C
+static void do_check_remalloced_chunk(mstate av, mchunkptr p, INTERNAL_SIZE_T s)
+#else
+static void do_check_remalloced_chunk(av, p, s)
+mstate av; mchunkptr p; INTERNAL_SIZE_T s;
+#endif
+{
+  /* chunk size with the PREV_INUSE flag bit masked off */
+  INTERNAL_SIZE_T sz = p->size & ~(PREV_INUSE);
+
+  /* arena ownership is only asserted for chunks not flagged as mmapped */
+  if (!chunk_is_mmapped(p)) {
+    assert(av == arena_for_chunk(p));
+  }
+
+  do_check_inuse_chunk(av, p);
+
+  /* Legal size ... */
+  assert((sz & MALLOC_ALIGN_MASK) == 0);
+  assert((unsigned long)(sz) >= MINSIZE);
+  /* ... and alignment */
+  assert(aligned_OK(chunk2mem(p)));
+  /* chunk is less than MINSIZE more than request */
+  assert((long)(sz) - (long)(s) >= 0);
+  assert((long)(sz) - (long)(s + MINSIZE) < 0);
+}
+
+/*
+  Properties of nonrecycled chunks at the point they are malloced
+
+  av: arena the chunk is being handed out from
+  p:  chunk about to be returned to the caller
+  s:  padded request size the chunk is meant to satisfy
+*/
+
+#if __STD_C
+static void do_check_malloced_chunk(mstate av, mchunkptr p, INTERNAL_SIZE_T s)
+#else
+static void do_check_malloced_chunk(av, p, s)
+mstate av; mchunkptr p; INTERNAL_SIZE_T s;
+#endif
+{
+  /* same as recycled case ... */
+  do_check_remalloced_chunk(av, p, s);
+
+  /*
+    ... plus,  must obey implementation invariant that prev_inuse is
+    always true of any allocated chunk; i.e., that each allocated
+    chunk borders either a previously allocated and still in-use
+    chunk, or the base of its memory arena. This is ensured
+    by making all allocations from the `lowest' part of any found
+    chunk.  This does not necessarily hold however for chunks
+    recycled via fastbins.
+  */
+
+  assert(prev_inuse(p));
+}
+
+
+/*
+  Properties of malloc_state.
+
+  This may be useful for debugging malloc, as well as detecting user
+  programmer errors that somehow write into malloc_state.
+
+  If you are extending or experimenting with this malloc, you can
+  probably figure out how to hack this routine to print out or
+  display chunk addresses, sizes, bins, and other instrumentation.
+
+  The walk covers, in order: compile-time sanity of INTERNAL_SIZE_T and
+  MALLOC_ALIGNMENT, every fastbin, every normal bin (including the
+  run of inuse chunks that follows each free chunk), the top chunk,
+  and the system_mem/max_system_mem statistics.  Nothing beyond the
+  first two assertions is checked while av->top is still 0 or
+  initial_top(av).
+*/
+
+static void do_check_malloc_state(mstate av)
+{
+  int i;
+  mchunkptr p;
+  mchunkptr q;
+  mbinptr b;
+  unsigned int binbit;
+  int empty;
+  unsigned int idx;
+  INTERNAL_SIZE_T size;
+  unsigned long total = 0;  /* running sum of the sizes of all binned chunks */
+  int max_fast_bin;
+
+  /* internal size_t must be no wider than pointer type */
+  assert(sizeof(INTERNAL_SIZE_T) <= sizeof(char*));
+
+  /* alignment is a power of 2 */
+  assert((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-1)) == 0);
+
+  /* cannot run remaining checks until fully initialized */
+  if (av->top == 0 || av->top == initial_top(av))
+    return;
+
+
+  /* properties of fastbins */
+
+  /* max_fast is in allowed range */
+  assert((av->max_fast & ~1) <= request2size(MAX_FAST_SIZE));
+
+  max_fast_bin = fastbin_index(av->max_fast);
+
+  for (i = 0; i < NFASTBINS; ++i) {
+    p = av->fastbins[i];
+
+    /* all bins past max_fast are empty */
+    if (i > max_fast_bin)
+      assert(p == 0);
+
+    /* fastbins are singly linked through fd and end with a null pointer */
+    while (p != 0) {
+      /* each chunk claims to be inuse */
+      do_check_inuse_chunk(av, p);
+      total += chunksize(p);
+      /* chunk belongs in this bin */
+      assert(fastbin_index(chunksize(p)) == i);
+      p = p->fd;
+    }
+  }
+
+  /*
+    Any fastbin content implies the fastchunks flag is set.
+    NOTE(review): the else-branch is only reached with total == 0, so
+    its assertion can never fail.
+  */
+  if (total != 0)
+    assert(have_fastchunks(av));
+  else if (!have_fastchunks(av))
+    assert(total == 0);
+
+  /* check normal bins */
+  for (i = 1; i < NBINS; ++i) {
+    b = bin_at(av,i);
+
+    /* binmap is accurate (except for bin 1 == unsorted_chunks) */
+    if (i >= 2) {
+      binbit = get_binmap(av,i);
+      empty = last(b) == b;
+      /*
+        A clear bit requires an empty bin.
+        NOTE(review): the else-branch is only reached with binbit
+        nonzero, so its assertion can never fail.
+      */
+      if (!binbit)
+        assert(empty);
+      else if (!empty)
+        assert(binbit);
+    }
+
+    /* walk the bin from its last chunk backwards through bk */
+    for (p = last(b); p != b; p = p->bk) {
+      /* each chunk claims to be free */
+      do_check_free_chunk(av, p);
+      size = chunksize(p);
+      total += size;
+      if (i >= 2) {
+        /* chunk belongs in bin */
+        idx = bin_index(size);
+        assert(idx == (unsigned int)i);
+        /* lists are sorted */
+        if ((unsigned long) size >= (unsigned long)(FIRST_SORTED_BIN_SIZE)) {
+	  assert(p->bk == b ||
+		 (unsigned long)chunksize(p->bk) >=
+		 (unsigned long)chunksize(p));
+	}
+      }
+      /* chunk is followed by a legal chain of inuse chunks */
+      for (q = next_chunk(p);
+           (q != av->top && inuse(q) &&
+             (unsigned long)(chunksize(q)) >= MINSIZE);
+           q = next_chunk(q))
+        do_check_inuse_chunk(av, q);
+    }
+  }
+
+  /* top chunk is OK */
+  check_chunk(av, av->top);
+
+  /* sanity checks for statistics */
+
+
+  assert((unsigned long)(av->system_mem) <=
+         (unsigned long)(av->max_system_mem));
+
+
+}
+#endif
+
+
+
+/* ----------- Routines dealing with system allocation -------------- */
+
+/* No system allocation routines supported */
+
+
+/*------------------------ Public wrappers. --------------------------------*/
+
+
+
+#undef DEBUG_MALLOC
+/*
+  public_mALLOc: allocate `bytes' bytes from one of the arenas reachable
+  from `arena_list'.
+
+  Arenas are visited by following ->next from arena_list until the walk
+  comes back to its starting point.  A first pass only uses arenas whose
+  mutex can be taken with mutex_trylock(); if that pass yields nothing,
+  a second pass blocks on each arena's mutex in turn.
+
+  Returns the user pointer, or NULL when arena_list is NULL or no arena
+  could satisfy the request.  The arena the search ended on is stored
+  under arena_key with tsd_setspecific().
+
+  The size_t argument is cast to long before being handed to the
+  "%ld" conversions of the debug printers, so that the argument type
+  always matches the format.
+*/
+Void_t*
+public_mALLOc(cvmx_arena_list_t arena_list, size_t bytes)
+{
+  mstate ar_ptr, orig_ar_ptr;
+  Void_t *victim = NULL;
+  /* debug only!  Unsynchronized: only used to report arena changes. */
+  static mstate debug_prev_ar;
+#ifdef DEBUG_MALLOC
+  int arena_cnt=0;
+#endif
+
+  ar_ptr = arena_list;
+
+  if (!ar_ptr)
+  {
+     return(NULL);
+  }
+
+  if (debug_prev_ar != ar_ptr)
+  {
+      debug_printf("New arena: %p\n", ar_ptr);
+#ifdef CVMX_SPINLOCK_DEBUG
+      cvmx_dprintf("lock wait count for arena: %p is %ld\n", ar_ptr, ar_ptr->mutex.wait_cnt);
+#endif
+      debug_prev_ar = ar_ptr;
+  }
+  orig_ar_ptr = ar_ptr;
+
+  /* try to get an arena without contention */
+  do
+  {
+#ifdef DEBUG_MALLOC
+      arena_cnt++;
+#endif
+      if (!mutex_trylock(&ar_ptr->mutex))
+      {
+          /* we locked it */
+          victim = _int_malloc(ar_ptr, bytes);
+          (void)mutex_unlock(&ar_ptr->mutex);
+          if(victim)
+          {
+              break;
+          }
+      }
+      ar_ptr = ar_ptr->next;
+  } while (ar_ptr != orig_ar_ptr);
+
+  /* we couldn't get the memory without contention, so try all
+     arenas.  SLOW! */
+  if (!victim)
+  {
+      ar_ptr = orig_ar_ptr;
+      do
+      {
+#ifdef DEBUG_MALLOC
+          arena_cnt++;
+#endif
+          mutex_lock(&ar_ptr->mutex);
+          victim = _int_malloc(ar_ptr, bytes);
+          (void)mutex_unlock(&ar_ptr->mutex);
+          if(victim)
+          {
+              break;
+          }
+          ar_ptr = ar_ptr->next;
+      } while (ar_ptr != orig_ar_ptr);
+  }
+
+
+  /* on success ar_ptr is the arena the chunk was carved from */
+  assert(!victim || chunk_is_mmapped(mem2chunk(victim)) ||
+         ar_ptr == arena_for_chunk(mem2chunk(victim)));
+
+#ifdef DEBUG_MALLOC
+  if (!victim)
+  {
+     cvmx_dprintf("Malloc failed: size: %ld, arena_cnt: %d\n", (long)bytes, arena_cnt);
+  }
+#endif
+
+  debug_printf("cvmx_malloc(%ld) = %p\n", (long)bytes, victim);
+
+  /* remember which arena we last used..... */
+  tsd_setspecific(arena_key, (Void_t *)ar_ptr);
+  return victim;
+}
+
+
+
+/*
+  public_fREe: give the block at `mem' back to the arena it came from.
+
+  The owning arena is looked up from the chunk header with
+  arena_for_chunk(); its mutex is held around the call to _int_free().
+  A null `mem' is accepted and ignored (after the debug trace).
+*/
+void
+public_fREe(Void_t* mem)
+{
+  mchunkptr p;                          /* chunk corresponding to mem */
+  mstate ar_ptr;                        /* arena that owns the chunk */
+
+  debug_printf("cvmx_free(%p)\n", mem);
+
+  /* free(0) has no effect */
+  if (mem != 0) {
+    p = mem2chunk(mem);
+    ar_ptr = arena_for_chunk(p);
+    assert(ar_ptr);
+
+#if THREAD_STATS
+    /* count whether the lock was free or had to be waited for */
+    if (mutex_trylock(&ar_ptr->mutex)) {
+      (void)mutex_lock(&ar_ptr->mutex);
+      ++(ar_ptr->stat_lock_wait);
+    }
+    else
+      ++(ar_ptr->stat_lock_direct);
+#else
+    (void)mutex_lock(&ar_ptr->mutex);
+#endif
+
+    _int_free(ar_ptr, mem);
+    (void)mutex_unlock(&ar_ptr->mutex);
+  }
+}
+
+/*
+  public_rEALLOc: resize the block at `oldmem' to `bytes' bytes.
+
+  arena_list: arenas to allocate from when oldmem is null
+  oldmem:     block to resize, or null to behave like public_mALLOc
+  bytes:      new requested size
+
+  With REALLOC_ZERO_BYTES_FREES set, a zero `bytes' on a non-null block
+  frees it and returns 0.  Otherwise the resize is carried out by
+  _int_realloc() inside the arena that owns `oldmem', with that arena's
+  mutex held.  Returns the (possibly moved) user pointer, or whatever
+  failure value _int_realloc() reports.
+*/
+Void_t*
+public_rEALLOc(cvmx_arena_list_t arena_list, Void_t* oldmem, size_t bytes)
+{
+  mstate ar_ptr;
+  INTERNAL_SIZE_T    nb;      /* padded request size */
+
+  mchunkptr oldp;             /* chunk corresponding to oldmem */
+
+  Void_t* newp;             /* chunk to return */
+
+
+#if REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0 && oldmem != NULL) { public_fREe(oldmem); return 0; }
+#endif
+
+  /* realloc of null is supposed to be same as malloc */
+  if (oldmem == 0) return public_mALLOc(arena_list, bytes);
+
+  oldp    = mem2chunk(oldmem);
+
+  /*
+    nb is not used further in this wrapper.  The macro is kept because
+    it presumably rejects out-of-range requests by returning early, as
+    its use in _int_malloc suggests -- TODO confirm against its
+    definition.
+  */
+  checked_request2size(bytes, nb);
+
+
+  ar_ptr = arena_for_chunk(oldp);
+  /* same sanity check public_fREe applies before touching the mutex */
+  assert(ar_ptr);
+  (void)mutex_lock(&ar_ptr->mutex);
+
+
+  newp = _int_realloc(ar_ptr, oldmem, bytes);
+
+  (void)mutex_unlock(&ar_ptr->mutex);
+  assert(!newp || chunk_is_mmapped(mem2chunk(newp)) ||
+         ar_ptr == arena_for_chunk(mem2chunk(newp)));
+  return newp;
+}
+
+#undef DEBUG_MEMALIGN
+/*
+  public_mEMALIGn: allocate `bytes' bytes aligned to `alignment' from
+  one of the arenas reachable from `arena_list'.
+
+  Requests whose alignment does not exceed MALLOC_ALIGNMENT are relayed
+  to public_mALLOc(); alignments below MINSIZE are raised to MINSIZE.
+  Arenas are searched exactly as in public_mALLOc(): first only those
+  whose mutex can be taken with mutex_trylock(), then every arena with
+  a blocking lock.
+
+  Returns the user pointer, or NULL when arena_list is NULL or no arena
+  could satisfy the request.
+*/
+Void_t*
+public_mEMALIGn(cvmx_arena_list_t arena_list, size_t alignment, size_t bytes)
+{
+  mstate ar_ptr, orig_ar_ptr;
+  Void_t *p = NULL;
+#ifdef DEBUG_MEMALIGN
+  int arena_cnt=0;
+#endif
+
+
+  /* If need less alignment than we give anyway, just relay to malloc */
+  if (alignment <= MALLOC_ALIGNMENT) return public_mALLOc(arena_list, bytes);
+
+  /* Otherwise, ensure that it is at least a minimum chunk size */
+  if (alignment <  MINSIZE) alignment = MINSIZE;
+
+
+  ar_ptr = arena_list;
+
+  if (!ar_ptr)
+  {
+     return(NULL);
+  }
+
+  orig_ar_ptr = ar_ptr;
+
+
+  /* try to get an arena without contention */
+  do
+  {
+
+#ifdef DEBUG_MEMALIGN
+      arena_cnt++;
+#endif
+      if (!mutex_trylock(&ar_ptr->mutex))
+      {
+          /* we locked it */
+          p = _int_memalign(ar_ptr, alignment, bytes);
+          (void)mutex_unlock(&ar_ptr->mutex);
+          if(p)
+          {
+              break;
+          }
+      }
+      ar_ptr = ar_ptr->next;
+  } while (ar_ptr != orig_ar_ptr);
+
+
+  /* we couldn't get the memory without contention, so try all
+     arenas.  SLOW! */
+  if (!p)
+  {
+      ar_ptr = orig_ar_ptr;
+      do
+      {
+#ifdef DEBUG_MEMALIGN
+          /* count every arena attempt, as public_mALLOc does */
+          arena_cnt++;
+#endif
+          mutex_lock(&ar_ptr->mutex);
+          p = _int_memalign(ar_ptr, alignment, bytes);
+          (void)mutex_unlock(&ar_ptr->mutex);
+          if(p)
+          {
+              break;
+          }
+          ar_ptr = ar_ptr->next;
+      } while (ar_ptr != orig_ar_ptr);
+  }
+
+
+#ifdef DEBUG_MEMALIGN
+  if (!p)
+  {
+     /* arguments are cast so that each one matches its conversion */
+     cvmx_dprintf("Memalign failed: align: 0x%lx, size: %ld, arena_cnt: %d\n",
+                  (unsigned long)alignment, (long)bytes, arena_cnt);
+  }
+#endif
+
+  /* on success ar_ptr is the arena the chunk was carved from */
+  assert(!p || ar_ptr == arena_for_chunk(mem2chunk(p)));
+  return p;
+}
+
+
+
+/*
+  public_cALLOc: allocate zero-filled storage for `n' elements of
+  `elem_size' bytes each from the arenas reachable from `arena_list'.
+
+  Returns the user pointer, or NULL when n * elem_size does not fit in
+  a size_t or when public_mALLOc() cannot satisfy the request.
+*/
+Void_t*
+public_cALLOc(cvmx_arena_list_t arena_list, size_t n, size_t elem_size)
+{
+  size_t total;
+  Void_t* mem;
+
+  /*
+    Refuse requests whose byte count would wrap around: a wrapped
+    product would yield a block smaller than the caller believes.
+  */
+  if (elem_size != 0 && n > ((size_t)-1) / elem_size)
+  {
+     return NULL;
+  }
+
+  total = n * elem_size;
+
+  mem = public_mALLOc(arena_list, total);
+  if (mem)
+  {
+     memset(mem, 0, total);
+  }
+
+  return mem;
+}
+
+
+#ifndef _LIBC
+
+/*
+  public_cFREe: alias for public_fREe(); forwards `m' unchanged.
+  Only compiled when _LIBC is not defined.
+*/
+void
+public_cFREe(Void_t* m)
+{
+  public_fREe(m);
+}
+
+#endif /* _LIBC */
+
+/*
+  ------------------------------ malloc ------------------------------
+
+  _int_malloc(av, bytes): carve a chunk with room for `bytes' user
+  bytes out of arena `av' and return its user pointer.  Returns NULL
+  when the arena cannot satisfy the request; there is no fallback to
+  system allocation.  The public wrappers in this file call it with
+  av->mutex held.
+
+  Search order:
+    1. the fastbin for the padded size
+    2. the smallbin for the padded size
+    3. the unsorted list, sorting leftovers into bins along the way
+    4. for large requests, the request's own bin (smallest fit)
+    5. the next non-empty larger bin, located through the binmap
+    6. a split of the top chunk
+    7. fastbin consolidation followed by a retry from step 3
+
+  Every chunk handed out is first passed to set_arena_for_chunk() --
+  presumably so that arena_for_chunk() can recover `av' later; confirm
+  against the macro definitions.
+*/
+
+static Void_t*
+_int_malloc(mstate av, size_t bytes)
+{
+  INTERNAL_SIZE_T nb;               /* normalized request size */
+  unsigned int    idx;              /* associated bin index */
+  mbinptr         bin;              /* associated bin */
+  mfastbinptr*    fb;               /* associated fastbin */
+
+  mchunkptr       victim;           /* inspected/selected chunk */
+  INTERNAL_SIZE_T size;             /* its size */
+  int             victim_index;     /* its bin index */
+
+  mchunkptr       remainder;        /* remainder from a split */
+  unsigned long   remainder_size;   /* its size */
+
+  unsigned int    block;            /* bit map traverser */
+  unsigned int    bit;              /* bit map traverser */
+  unsigned int    map;              /* current word of binmap */
+
+  mchunkptr       fwd;              /* misc temp for linking */
+  mchunkptr       bck;              /* misc temp for linking */
+
+  /*
+    Convert request size to internal form by adding SIZE_SZ bytes
+    overhead plus possibly more to obtain necessary alignment and/or
+    to obtain a size of at least MINSIZE, the smallest allocatable
+    size. Also, checked_request2size traps (returning 0) request sizes
+    that are so large that they wrap around zero when padded and
+    aligned.
+  */
+
+
+  checked_request2size(bytes, nb);
+
+  /*
+    If the size qualifies as a fastbin, first check corresponding bin.
+    This code is safe to execute even if av is not yet initialized, so we
+    can try it without checking, which saves some time on this fast path.
+  */
+
+  if ((unsigned long)(nb) <= (unsigned long)(av->max_fast)) {
+    fb = &(av->fastbins[(fastbin_index(nb))]);
+    if ( (victim = *fb) != 0) {
+      /* pop the first chunk off the singly linked fastbin */
+      *fb = victim->fd;
+      check_remalloced_chunk(av, victim, nb);
+      set_arena_for_chunk(victim, av);
+      return chunk2mem(victim);
+    }
+  }
+
+  /*
+    If a small request, check regular bin.  Since these "smallbins"
+    hold one size each, no searching within bins is necessary.
+    (For a large request, we need to wait until unsorted chunks are
+    processed to find best fit. But for small ones, fits are exact
+    anyway, so we can check now, which is faster.)
+  */
+
+  if (in_smallbin_range(nb)) {
+    idx = smallbin_index(nb);
+    bin = bin_at(av,idx);
+
+    if ( (victim = last(bin)) != bin) {
+      if (victim == 0) /* initialization check */
+        malloc_consolidate(av);
+      else {
+        /* take the last chunk of the bin and unlink it */
+        bck = victim->bk;
+        set_inuse_bit_at_offset(victim, nb);
+        bin->bk = bck;
+        bck->fd = bin;
+
+        set_arena_for_chunk(victim, av);
+        check_malloced_chunk(av, victim, nb);
+        return chunk2mem(victim);
+      }
+    }
+  }
+
+  /*
+     If this is a large request, consolidate fastbins before continuing.
+     While it might look excessive to kill all fastbins before
+     even seeing if there is space available, this avoids
+     fragmentation problems normally associated with fastbins.
+     Also, in practice, programs tend to have runs of either small or
+     large requests, but less often mixtures, so consolidation is not
+     invoked all that often in most programs. And the programs that
+     it is called frequently in otherwise tend to fragment.
+  */
+
+  else {
+    idx = largebin_index(nb);
+    if (have_fastchunks(av))
+      malloc_consolidate(av);
+  }
+
+  /*
+    Process recently freed or remaindered chunks, taking one only if
+    it is exact fit, or, if this a small request, the chunk is remainder from
+    the most recent non-exact fit.  Place other traversed chunks in
+    bins.  Note that this step is the only place in any routine where
+    chunks are placed in bins.
+
+    The outer loop here is needed because we might not realize until
+    near the end of malloc that we should have consolidated, so must
+    do so and retry. This happens at most once, and only when we would
+    otherwise need to expand memory to service a "small" request.
+  */
+
+  for(;;) {
+
+    while ( (victim = unsorted_chunks(av)->bk) != unsorted_chunks(av)) {
+      bck = victim->bk;
+      size = chunksize(victim);
+
+      /*
+         If a small request, try to use last remainder if it is the
+         only chunk in unsorted bin.  This helps promote locality for
+         runs of consecutive small requests. This is the only
+         exception to best-fit, and applies only when there is
+         no exact fit for a small chunk.
+      */
+
+      if (in_smallbin_range(nb) &&
+          bck == unsorted_chunks(av) &&
+          victim == av->last_remainder &&
+          (unsigned long)(size) > (unsigned long)(nb + MINSIZE)) {
+
+        /* split and reattach remainder */
+        remainder_size = size - nb;
+        remainder = chunk_at_offset(victim, nb);
+        unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder;
+        av->last_remainder = remainder;
+        remainder->bk = remainder->fd = unsorted_chunks(av);
+
+        set_head(victim, nb | PREV_INUSE);
+        set_head(remainder, remainder_size | PREV_INUSE);
+        set_foot(remainder, remainder_size);
+
+        set_arena_for_chunk(victim, av);
+        check_malloced_chunk(av, victim, nb);
+        return chunk2mem(victim);
+      }
+
+      /* remove from unsorted list */
+      unsorted_chunks(av)->bk = bck;
+      bck->fd = unsorted_chunks(av);
+
+      /* Take now instead of binning if exact fit */
+
+      if (size == nb) {
+        set_inuse_bit_at_offset(victim, size);
+        set_arena_for_chunk(victim, av);
+        check_malloced_chunk(av, victim, nb);
+        return chunk2mem(victim);
+      }
+
+      /* place chunk in bin */
+
+      if (in_smallbin_range(size)) {
+        victim_index = smallbin_index(size);
+        bck = bin_at(av, victim_index);
+        fwd = bck->fd;
+      }
+      else {
+        victim_index = largebin_index(size);
+        bck = bin_at(av, victim_index);
+        fwd = bck->fd;
+
+        if (fwd != bck) {
+          /* if smaller than smallest, place first */
+          if ((unsigned long)(size) < (unsigned long)(bck->bk->size)) {
+            fwd = bck;
+            bck = bck->bk;
+          }
+          else if ((unsigned long)(size) >= 
+                   (unsigned long)(FIRST_SORTED_BIN_SIZE)) {
+
+            /* maintain large bins in sorted order */
+            size |= PREV_INUSE; /* Or with inuse bit to speed comparisons */
+            while ((unsigned long)(size) < (unsigned long)(fwd->size)) {
+              fwd = fwd->fd;
+	    }
+            bck = fwd->bk;
+          }
+        }
+      }
+
+      /* link victim between bck and fwd and flag its bin as non-empty */
+      mark_bin(av, victim_index);
+      victim->bk = bck;
+      victim->fd = fwd;
+      fwd->bk = victim;
+      bck->fd = victim;
+    }
+
+    /*
+      If a large request, scan through the chunks of current bin in
+      sorted order to find smallest that fits.  This is the only step
+      where an unbounded number of chunks might be scanned without doing
+      anything useful with them. However the lists tend to be short.
+    */
+
+    if (!in_smallbin_range(nb)) {
+      bin = bin_at(av, idx);
+
+      for (victim = last(bin); victim != bin; victim = victim->bk) {
+	size = chunksize(victim);
+
+	if ((unsigned long)(size) >= (unsigned long)(nb)) {
+	  remainder_size = size - nb;
+	  unlink(victim, bck, fwd);
+
+	  /* Exhaust */
+	  if (remainder_size < MINSIZE)  {
+	    set_inuse_bit_at_offset(victim, size);
+        set_arena_for_chunk(victim, av);
+	    check_malloced_chunk(av, victim, nb);
+	    return chunk2mem(victim);
+	  }
+	  /* Split */
+	  else {
+	    remainder = chunk_at_offset(victim, nb);
+	    unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder;
+	    remainder->bk = remainder->fd = unsorted_chunks(av);
+	    set_head(victim, nb | PREV_INUSE);
+	    set_head(remainder, remainder_size | PREV_INUSE);
+	    set_foot(remainder, remainder_size);
+        set_arena_for_chunk(victim, av);
+	    check_malloced_chunk(av, victim, nb);
+	    return chunk2mem(victim);
+	  }
+	}
+      }
+    }
+
+    /*
+      Search for a chunk by scanning bins, starting with next largest
+      bin. This search is strictly by best-fit; i.e., the smallest
+      (with ties going to approximately the least recently used) chunk
+      that fits is selected.
+
+      The bitmap avoids needing to check that most blocks are nonempty.
+      The particular case of skipping all bins during warm-up phases
+      when no chunks have been returned yet is faster than it might look.
+    */
+
+    ++idx;
+    bin = bin_at(av,idx);
+    block = idx2block(idx);
+    map = av->binmap[block];
+    bit = idx2bit(idx);
+
+    for (;;) {
+
+      /* Skip rest of block if there are no more set bits in this block.  */
+      if (bit > map || bit == 0) {
+        do {
+          if (++block >= BINMAPSIZE)  /* out of bins */
+            goto use_top;
+        } while ( (map = av->binmap[block]) == 0);
+
+        bin = bin_at(av, (block << BINMAPSHIFT));
+        bit = 1;
+      }
+
+      /* Advance to bin with set bit. There must be one. */
+      while ((bit & map) == 0) {
+        bin = next_bin(bin);
+        bit <<= 1;
+        assert(bit != 0);
+      }
+
+      /* Inspect the bin. It is likely to be non-empty */
+      victim = last(bin);
+
+      /*  If a false alarm (empty bin), clear the bit. */
+      if (victim == bin) {
+        av->binmap[block] = map &= ~bit; /* Write through */
+        bin = next_bin(bin);
+        bit <<= 1;
+      }
+
+      else {
+        size = chunksize(victim);
+
+        /*  We know the first chunk in this bin is big enough to use. */
+        assert((unsigned long)(size) >= (unsigned long)(nb));
+
+        remainder_size = size - nb;
+
+        /* unlink */
+        bck = victim->bk;
+        bin->bk = bck;
+        bck->fd = bin;
+
+        /* Exhaust */
+        if (remainder_size < MINSIZE) {
+          set_inuse_bit_at_offset(victim, size);
+          set_arena_for_chunk(victim, av);
+          check_malloced_chunk(av, victim, nb);
+          return chunk2mem(victim);
+        }
+
+        /* Split */
+        else {
+          remainder = chunk_at_offset(victim, nb);
+
+          unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder;
+          remainder->bk = remainder->fd = unsorted_chunks(av);
+          /* advertise as last remainder */
+          if (in_smallbin_range(nb))
+            av->last_remainder = remainder;
+
+          set_head(victim, nb | PREV_INUSE);
+          set_head(remainder, remainder_size | PREV_INUSE);
+          set_foot(remainder, remainder_size);
+          set_arena_for_chunk(victim, av);
+          check_malloced_chunk(av, victim, nb);
+          return chunk2mem(victim);
+        }
+      }
+    }
+
+  use_top:
+    /*
+      If large enough, split off the chunk bordering the end of memory
+      (held in av->top). Note that this is in accord with the best-fit
+      search rule.  In effect, av->top is treated as larger (and thus
+      less well fitting) than any other available chunk since it can
+      be extended to be as large as necessary (up to system
+      limitations).
+
+      We require that av->top always exists (i.e., has size >=
+      MINSIZE) after initialization, so if it would otherwise be
+      exhausted by current request, it is replenished. (The main
+      reason for ensuring it exists is that we may need MINSIZE space
+      to put in fenceposts in sysmalloc.)
+
+      NOTE: in this port top is never replenished; a request that top
+      cannot cover ends in the NULL return below.
+    */
+
+    victim = av->top;
+    size = chunksize(victim);
+
+    if ((unsigned long)(size) >= (unsigned long)(nb + MINSIZE)) {
+      remainder_size = size - nb;
+      remainder = chunk_at_offset(victim, nb);
+      av->top = remainder;
+      set_head(victim, nb | PREV_INUSE);
+      set_head(remainder, remainder_size | PREV_INUSE);
+
+      set_arena_for_chunk(victim, av);
+      check_malloced_chunk(av, victim, nb);
+      return chunk2mem(victim);
+    }
+
+    /*
+      If there is space available in fastbins, consolidate and retry,
+      to possibly avoid expanding memory. This can occur only if nb is
+      in smallbin range so we didn't consolidate upon entry.
+    */
+
+    else if (have_fastchunks(av)) {
+      assert(in_smallbin_range(nb));
+      malloc_consolidate(av);
+      idx = smallbin_index(nb); /* restore original bin index */
+    }
+
+    /*
+       Otherwise, relay to handle system-dependent cases
+    */
+    else
+      return(NULL); // sysmalloc not supported
+  }
+}
+
+/*
+  ------------------------------ free ------------------------------
+
+  _int_free(av, mem): return the chunk behind user pointer `mem' to
+  arena `av'.  A null `mem' is ignored.  public_fREe calls it with
+  av->mutex held.
+
+  - A chunk no larger than av->max_fast is pushed onto its fastbin
+    without any coalescing (unless TRIM_FASTBINS is set and the chunk
+    borders top).
+  - Any other chunk not flagged as mmapped is merged with a free
+    predecessor and/or successor, then either linked in right after
+    the unsorted list header or, if it borders av->top, absorbed into
+    top.
+  - A chunk flagged as mmapped that is too large for a fastbin is left
+    untouched.
+*/
+
+static void
+_int_free(mstate av, Void_t* mem)
+{
+  mchunkptr       p;           /* chunk corresponding to mem */
+  INTERNAL_SIZE_T size;        /* its size */
+  mfastbinptr*    fb;          /* associated fastbin */
+  mchunkptr       nextchunk;   /* next contiguous chunk */
+  INTERNAL_SIZE_T nextsize;    /* its size */
+  int             nextinuse;   /* true if nextchunk is used */
+  INTERNAL_SIZE_T prevsize;    /* size of previous contiguous chunk */
+  mchunkptr       bck;         /* misc temp for linking */
+  mchunkptr       fwd;         /* misc temp for linking */
+
+
+  /* free(0) has no effect */
+  if (mem != 0) {
+    p = mem2chunk(mem);
+    size = chunksize(p);
+
+    check_inuse_chunk(av, p);
+
+    /*
+      If eligible, place chunk on a fastbin so it can be found
+      and used quickly in malloc.
+    */
+
+    if ((unsigned long)(size) <= (unsigned long)(av->max_fast)
+
+#if TRIM_FASTBINS
+        /*
+           If TRIM_FASTBINS set, don't place chunks
+           bordering top into fastbins
+        */
+        && (chunk_at_offset(p, size) != av->top)
+#endif
+        ) {
+
+      /* push onto the front of the fastbin for this size */
+      set_fastchunks(av);
+      fb = &(av->fastbins[fastbin_index(size)]);
+      p->fd = *fb;
+      *fb = p;
+    }
+
+    /*
+       Consolidate other non-mmapped chunks as they arrive.
+    */
+
+    else if (!chunk_is_mmapped(p)) {
+      nextchunk = chunk_at_offset(p, size);
+      nextsize = chunksize(nextchunk);
+      assert(nextsize > 0);
+
+      /* consolidate backward */
+      if (!prev_inuse(p)) {
+        prevsize = p->prev_size;
+        size += prevsize;
+        p = chunk_at_offset(p, -((long) prevsize));
+        unlink(p, bck, fwd);
+      }
+
+      if (nextchunk != av->top) {
+        /* get and clear inuse bit */
+        nextinuse = inuse_bit_at_offset(nextchunk, nextsize);
+
+        /* consolidate forward */
+        if (!nextinuse) {
+          unlink(nextchunk, bck, fwd);
+          size += nextsize;
+        } else
+	  clear_inuse_bit_at_offset(nextchunk, 0);
+
+        /*
+          Place the chunk in unsorted chunk list. Chunks are
+          not placed into regular bins until after they have
+          been given one chance to be used in malloc.
+        */
+
+        bck = unsorted_chunks(av);
+        fwd = bck->fd;
+        p->bk = bck;
+        p->fd = fwd;
+        bck->fd = p;
+        fwd->bk = p;
+
+        set_head(p, size | PREV_INUSE);
+        set_foot(p, size);
+
+        check_free_chunk(av, p);
+      }
+
+      /*
+         If the chunk borders the current high end of memory,
+         consolidate into top
+      */
+
+      else {
+        size += nextsize;
+        set_head(p, size | PREV_INUSE);
+        av->top = p;
+        check_chunk(av, p);
+      }
+
+      /*
+        If freeing a large space, consolidate possibly-surrounding
+        chunks. Then, if the total unused topmost memory exceeds trim
+        threshold, ask malloc_trim to reduce top.
+
+        Unless max_fast is 0, we don't know if there are fastbins
+        bordering top, so we cannot tell for sure whether threshold
+        has been reached unless fastbins are consolidated.  But we
+        don't want to consolidate on each free.  As a compromise,
+        consolidation is performed if FASTBIN_CONSOLIDATION_THRESHOLD
+        is reached.
+
+        NOTE: no trimming is done in this routine; the code below only
+        consolidates the fastbins.
+      */
+
+      if ((unsigned long)(size) >= FASTBIN_CONSOLIDATION_THRESHOLD) {
+        if (have_fastchunks(av))
+          malloc_consolidate(av);
+      }
+    }
+  }
+}
+
+/*
+  ------------------------- malloc_consolidate -------------------------
+
+  malloc_consolidate is a specialized version of free() that tears
+  down chunks held in fastbins.  Free itself cannot be used for this
+  purpose since, among other things, it might place chunks back onto
+  fastbins.  So, instead, we need to use a minor variant of the same
+  code.
+
+  Also, because this routine needs to be called the first time through
+  malloc anyway, it turns out to be the perfect place to trigger
+  initialization code.
+
+  av: arena to consolidate.  When av->max_fast is 0 the arena is
+      initialized with malloc_init_state() instead.  Otherwise every
+      fastbin up to the one for av->max_fast is emptied, and each
+      chunk is merged with its free neighbours and either linked into
+      the unsorted list or absorbed into av->top.
+*/
+
+#if __STD_C
+static void malloc_consolidate(mstate av)
+#else
+static void malloc_consolidate(av) mstate av;
+#endif
+{
+  mfastbinptr*    fb;                 /* current fastbin being consolidated */
+  mfastbinptr*    maxfb;              /* last fastbin (for loop control) */
+  mchunkptr       p;                  /* current chunk being consolidated */
+  mchunkptr       nextp;              /* next chunk to consolidate */
+  mchunkptr       unsorted_bin;       /* bin header */
+  mchunkptr       first_unsorted;     /* chunk to link to */
+
+  /* These have same use as in free() */
+  mchunkptr       nextchunk;
+  INTERNAL_SIZE_T size;
+  INTERNAL_SIZE_T nextsize;
+  INTERNAL_SIZE_T prevsize;
+  int             nextinuse;
+  mchunkptr       bck;
+  mchunkptr       fwd;
+
+  /*
+    If max_fast is 0, we know that av hasn't
+    yet been initialized, in which case do so below
+  */
+
+  if (av->max_fast != 0) {
+    clear_fastchunks(av);
+
+    unsorted_bin = unsorted_chunks(av);
+
+    /*
+      Remove each chunk from fast bin and consolidate it, placing it
+      then in unsorted bin. Among other reasons for doing this,
+      placing in unsorted bin avoids needing to calculate actual bins
+      until malloc is sure that chunks aren't immediately going to be
+      reused anyway.
+    */
+
+    maxfb = &(av->fastbins[fastbin_index(av->max_fast)]);
+    fb = &(av->fastbins[0]);
+    do {
+      if ( (p = *fb) != 0) {
+        /* detach the whole list first, then walk it through nextp */
+        *fb = 0;
+
+        do {
+          check_inuse_chunk(av, p);
+          /* save the link now: p may be moved back to its predecessor below */
+          nextp = p->fd;
+
+          /* Slightly streamlined version of consolidation code in free() */
+          size = p->size & ~(PREV_INUSE);
+          nextchunk = chunk_at_offset(p, size);
+          nextsize = chunksize(nextchunk);
+
+          /* merge with a free predecessor */
+          if (!prev_inuse(p)) {
+            prevsize = p->prev_size;
+            size += prevsize;
+            p = chunk_at_offset(p, -((long) prevsize));
+            unlink(p, bck, fwd);
+          }
+
+          if (nextchunk != av->top) {
+            nextinuse = inuse_bit_at_offset(nextchunk, nextsize);
+
+            /* merge with a free successor, else mark p free in its header */
+            if (!nextinuse) {
+              size += nextsize;
+              unlink(nextchunk, bck, fwd);
+            } else
+	      clear_inuse_bit_at_offset(nextchunk, 0);
+
+            /* link p in right after the unsorted list header */
+            first_unsorted = unsorted_bin->fd;
+            unsorted_bin->fd = p;
+            first_unsorted->bk = p;
+
+            set_head(p, size | PREV_INUSE);
+            p->bk = unsorted_bin;
+            p->fd = first_unsorted;
+            set_foot(p, size);
+          }
+
+          /* the chunk borders top: absorb it into top */
+          else {
+            size += nextsize;
+            set_head(p, size | PREV_INUSE);
+            av->top = p;
+          }
+
+        } while ( (p = nextp) != 0);
+
+      }
+    } while (fb++ != maxfb);
+  }
+  else {
+    malloc_init_state(av);
+    check_malloc_state(av);
+  }
+}
+
+/* ------------------------------ realloc ------------------------------
+   _int_realloc(av, oldmem, bytes): resize the block at oldmem for a request
+   of `bytes` bytes in arena av.  Returns the user pointer, or 0 on failure. */
+
+static Void_t*
+_int_realloc(mstate av, Void_t* oldmem, size_t bytes)
+{
+  INTERNAL_SIZE_T  nb;              /* padded request size */
+
+  mchunkptr        oldp;            /* chunk corresponding to oldmem */
+  INTERNAL_SIZE_T  oldsize;         /* its size */
+
+  mchunkptr        newp;            /* chunk to return */
+  INTERNAL_SIZE_T  newsize;         /* its size */
+  Void_t*          newmem;          /* corresponding user mem */
+
+  mchunkptr        next;            /* next contiguous chunk after oldp */
+
+  mchunkptr        remainder;       /* extra space at end of newp */
+  unsigned long    remainder_size;  /* its size */
+
+  mchunkptr        bck;             /* misc temp for linking */
+  mchunkptr        fwd;             /* misc temp for linking */
+
+  unsigned long    copysize;        /* bytes to copy */
+  unsigned int     ncopies;         /* INTERNAL_SIZE_T words to copy */
+  INTERNAL_SIZE_T* s;               /* copy source */
+  INTERNAL_SIZE_T* d;               /* copy destination */
+
+  /* Build option: a zero-byte request frees oldmem and returns 0. */
+#if REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    _int_free(av, oldmem);
+    return 0;
+  }
+#endif
+
+  /* realloc of null is supposed to be same as malloc */
+  if (oldmem == 0) return _int_malloc(av, bytes);
+  /* nb receives the padded size for `bytes` (the "padded request size"). */
+  checked_request2size(bytes, nb);
+
+  oldp    = mem2chunk(oldmem);
+  oldsize = chunksize(oldp);
+
+  check_inuse_chunk(av, oldp);
+
+  // force to act like not mmapped: constant condition, so the else arm is dead
+  if (1) {
+
+    if ((unsigned long)(oldsize) >= (unsigned long)(nb)) {
+      /* already big enough; split below */
+      newp = oldp;
+      newsize = oldsize;
+    }
+
+    else {
+      next = chunk_at_offset(oldp, oldsize);
+
+      /* Try to expand forward into top (MINSIZE must remain for the new top) */
+      if (next == av->top &&
+          (unsigned long)(newsize = oldsize + chunksize(next)) >=
+          (unsigned long)(nb + MINSIZE)) {
+        set_head_size(oldp, nb );
+        av->top = chunk_at_offset(oldp, nb);
+        set_head(av->top, (newsize - nb) | PREV_INUSE);
+    	check_inuse_chunk(av, oldp);
+        set_arena_for_chunk(oldp, av);
+        return chunk2mem(oldp);
+      }
+
+      /* Try to expand forward into next chunk;  split off remainder below */
+      else if (next != av->top &&
+               !inuse(next) &&
+               (unsigned long)(newsize = oldsize + chunksize(next)) >=
+               (unsigned long)(nb)) {
+        newp = oldp;
+        unlink(next, bck, fwd);
+      }
+
+      /* allocate, copy, free */
+      else {
+        newmem = _int_malloc(av, nb - MALLOC_ALIGN_MASK);
+        if (newmem == 0)
+          return 0; /* propagate failure; oldmem has not been freed */
+
+        newp = mem2chunk(newmem);
+        newsize = chunksize(newp);
+
+        /*
+          Avoid copy if newp is next chunk after oldp.
+        */
+        if (newp == next) {
+          newsize += oldsize;
+          newp = oldp;
+        }
+        else {
+          /*
+            Unroll copy of <= 36 bytes (72 if 8byte sizes)
+            We know that contents have an odd number of
+            INTERNAL_SIZE_T-sized words; minimally 3.
+          */
+
+          copysize = oldsize - SIZE_SZ;
+          s = (INTERNAL_SIZE_T*)(oldmem);
+          d = (INTERNAL_SIZE_T*)(newmem);
+          ncopies = copysize / sizeof(INTERNAL_SIZE_T);
+          assert(ncopies >= 3);
+
+          if (ncopies > 9)
+            MALLOC_COPY(d, s, copysize);
+
+          else {
+            *(d+0) = *(s+0);
+            *(d+1) = *(s+1);
+            *(d+2) = *(s+2);
+            if (ncopies > 4) {
+              *(d+3) = *(s+3);
+              *(d+4) = *(s+4);
+              if (ncopies > 6) {
+                *(d+5) = *(s+5);
+                *(d+6) = *(s+6);
+                if (ncopies > 8) {
+                  *(d+7) = *(s+7);
+                  *(d+8) = *(s+8);
+                }
+              }
+            }
+          }
+          /* Contents are copied: release the old block, return the new one. */
+          _int_free(av, oldmem);
+          set_arena_for_chunk(newp, av);
+          check_inuse_chunk(av, newp);
+          return chunk2mem(newp);
+        }
+      }
+    }
+
+    /* If possible, free extra space in old or extended chunk */
+
+    assert((unsigned long)(newsize) >= (unsigned long)(nb));
+
+    remainder_size = newsize - nb;
+
+    if (remainder_size < MINSIZE) { /* not enough extra to split off */
+      set_head_size(newp, newsize);
+      set_inuse_bit_at_offset(newp, newsize);
+    }
+    else { /* split remainder */
+      remainder = chunk_at_offset(newp, nb);
+      set_head_size(newp, nb );
+      set_head(remainder, remainder_size | PREV_INUSE );
+      /* Mark remainder as inuse so free() won't complain */
+      set_inuse_bit_at_offset(remainder, remainder_size);
+      set_arena_for_chunk(remainder, av);
+      _int_free(av, chunk2mem(remainder));
+    }
+
+    set_arena_for_chunk(newp, av); /* presumably tags newp with av -- confirm */
+    check_inuse_chunk(av, newp);
+    return chunk2mem(newp);
+  }
+
+  /*
+    Handle mmap cases (not reachable: the condition above is the constant 1)
+  */
+
+  else {
+    /* If !HAVE_MMAP, but chunk_is_mmapped, user must have overwritten mem */
+    check_malloc_state(av);
+    MALLOC_FAILURE_ACTION;
+    return 0;
+  }
+}
+
+/*
+  ------------------------------ memalign ------------------------------
+  _int_memalign(av, alignment, bytes): allocate from arena av at an address
+  that is a multiple of alignment; 0 on failure or size-arithmetic overflow.
+*/
+
+static Void_t*
+_int_memalign(mstate av, size_t alignment, size_t bytes)
+{
+  INTERNAL_SIZE_T nb;             /* padded  request size */
+  char*           m;              /* memory returned by malloc call */
+  mchunkptr       p;              /* corresponding chunk */
+  char*           brk;            /* alignment point within p */
+  mchunkptr       newp;           /* chunk to return */
+  INTERNAL_SIZE_T newsize;        /* its size */
+  INTERNAL_SIZE_T leadsize;       /* leading space before alignment point */
+  mchunkptr       remainder;      /* spare room at end to split off */
+  unsigned long   remainder_size; /* its size */
+  INTERNAL_SIZE_T size;           /* size of the chunk being trimmed */
+
+  /* If need less alignment than we give anyway, just relay to malloc */
+  if (alignment <= MALLOC_ALIGNMENT) return _int_malloc(av, bytes);
+  /* Otherwise, ensure that it is at least a minimum chunk size */
+  if (alignment <  MINSIZE) alignment = MINSIZE;
+
+  /* Make sure alignment is power of 2 (in case MINSIZE is not).  */
+  if ((alignment & (alignment - 1)) != 0) {
+    size_t a = MALLOC_ALIGNMENT * 2;
+    while (a < alignment) {
+      a <<= 1;
+      /* Wrapped to 0: no power of 2 fits, and the loop would never end */
+      if (a == 0) { MALLOC_FAILURE_ACTION; return 0; }
+    }
+    alignment = a;
+  }
+
+  checked_request2size(bytes, nb);
+  /* Refuse requests whose worst-case padded size nb + alignment + MINSIZE
+     would wrap (alignment is at most half the size_t range at this point) */
+  if (nb > (size_t)(-1) - MINSIZE - alignment) {
+    MALLOC_FAILURE_ACTION;
+    return 0;
+  }
+
+  /*
+    Strategy: find a spot within that chunk that meets the alignment
+    request, and then possibly free the leading and trailing space.
+    Call malloc with worst case padding to hit alignment.
+  */
+  m  = (char*)(_int_malloc(av, nb + alignment + MINSIZE));
+  if (m == 0) return 0; /* propagate failure */
+
+  p = mem2chunk(m);
+  if ((((unsigned long)(m)) % alignment) != 0) { /* misaligned */
+    /*
+      Find an aligned spot inside chunk.  Since we need to give back
+      leading space in a chunk of at least MINSIZE, if the first
+      calculation places us at a spot with less than MINSIZE leader,
+      we can move to the next aligned spot -- we've allocated enough
+      total room so that this is always possible.
+    */
+    brk = (char*)mem2chunk(((unsigned long)(m + alignment - 1)) &
+                           -((signed long) alignment));
+    if ((unsigned long)(brk - (char*)(p)) < MINSIZE)
+      brk += alignment;
+    newp = (mchunkptr)brk;
+    leadsize = brk - (char*)(p);
+    newsize = chunksize(p) - leadsize;
+
+    /* For mmapped chunks, just adjust offset */
+    if (chunk_is_mmapped(p)) {
+      newp->prev_size = p->prev_size + leadsize;
+      set_head(newp, newsize|IS_MMAPPED);
+      set_arena_for_chunk(newp, av);
+      return chunk2mem(newp);
+    }
+
+    /* Otherwise, give back leader, use the rest */
+    set_head(newp, newsize | PREV_INUSE );
+    set_inuse_bit_at_offset(newp, newsize);
+    set_head_size(p, leadsize);
+    set_arena_for_chunk(p, av);
+    _int_free(av, chunk2mem(p));
+    p = newp;
+    assert (newsize >= nb &&
+            (((unsigned long)(chunk2mem(p))) % alignment) == 0);
+  }
+
+  /* Also give back spare room at the end */
+  if (!chunk_is_mmapped(p)) {
+    size = chunksize(p);
+    if ((unsigned long)(size) > (unsigned long)(nb + MINSIZE)) {
+      remainder_size = size - nb;
+      remainder = chunk_at_offset(p, nb);
+      set_head(remainder, remainder_size | PREV_INUSE );
+      set_head_size(p, nb);
+      set_arena_for_chunk(remainder, av);
+      _int_free(av, chunk2mem(remainder));
+    }
+  }
+
+  set_arena_for_chunk(p, av);
+  check_inuse_chunk(av, p);
+  return chunk2mem(p);
+}
+
+#if 1
+/*
+  ------------------------------ calloc ------------------------------
+  cALLOc(arena_list, n_elements, elem_size): zero-filled allocation obtained
+  from public_mALLOc; 0 if that fails or n_elements * elem_size overflows.
+*/
+#if __STD_C
+Void_t* cALLOc(cvmx_arena_list_t arena_list, size_t n_elements, size_t elem_size)
+#else
+Void_t* cALLOc(arena_list, n_elements, elem_size)
+cvmx_arena_list_t arena_list; size_t n_elements; size_t elem_size;
+#endif
+{
+  mchunkptr p;
+  unsigned long clearsize, nclears;
+  INTERNAL_SIZE_T* d;
+  Void_t* mem;
+
+  /* A wrapped product would yield a block smaller than the caller expects */
+  if (elem_size != 0 && n_elements > (size_t)(-1) / elem_size) {
+    MALLOC_FAILURE_ACTION;
+    return 0;
+  }
+  mem = public_mALLOc(arena_list, n_elements * elem_size);
+  if (mem == 0) return 0; /* propagate failure */
+
+  /*
+    Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that contents
+    have an odd number of INTERNAL_SIZE_T-sized words; minimally 3.
+    NOTE(review): mUSABLe subtracts 2*SIZE_SZ, not SIZE_SZ -- confirm clearsize.
+  */
+  p = mem2chunk(mem);
+  d = (INTERNAL_SIZE_T*)mem;
+  clearsize = chunksize(p) - SIZE_SZ;
+  nclears = clearsize / sizeof(INTERNAL_SIZE_T);
+  assert(nclears >= 3);
+  if (nclears > 9)
+    MALLOC_ZERO(d, clearsize);
+  else {
+    *(d+0) = 0;
+    *(d+1) = 0;
+    *(d+2) = 0;
+    if (nclears > 4) {
+      *(d+3) = 0;
+      *(d+4) = 0;
+      if (nclears > 6) {
+        *(d+5) = 0;
+        *(d+6) = 0;
+        if (nclears > 8) {
+          *(d+7) = 0;
+          *(d+8) = 0;
+        }
+      }
+    }
+  }
+  return mem;
+}
+#endif
+
+
+/*
+  ------------------------- malloc_usable_size -------------------------
+  mUSABLe(mem): usable byte count of mem's chunk, or 0 (null / free chunk).
+*/
+#if __STD_C
+size_t mUSABLe(Void_t* mem)
+#else
+size_t mUSABLe(mem) Void_t* mem;
+#endif
+{
+  mchunkptr chunk;
+  size_t overhead;     /* header bytes to subtract (includes the arena_ptr) */
+
+  if (mem == 0) return 0;
+  chunk = mem2chunk(mem);
+  if (chunk_is_mmapped(chunk)) overhead = 3*SIZE_SZ;
+  else if (inuse(chunk))       overhead = 2*SIZE_SZ;
+  else return 0;       /* neither mmapped nor marked in use */
+  return chunksize(chunk) - overhead;
+}
+
+/*
+  ------------------------------ mallinfo ------------------------------
+  mALLINFo(av): usage statistics for arena av, gathered by walking top, the
+  fastbins and the regular bins; hblks, hblkhd and usmblks are returned as 0.
+*/
+struct mallinfo mALLINFo(mstate av)
+{
+  struct mallinfo mi;
+  int i;
+  mbinptr b;
+  mchunkptr p;
+  INTERNAL_SIZE_T avail;
+  INTERNAL_SIZE_T fastavail;
+  int nblocks;
+  int nfastblocks;
+
+  /* Ensure initialization */
+  if (av->top == 0)  malloc_consolidate(av);
+  check_malloc_state(av);
+
+  /* Account for top */
+  avail = chunksize(av->top);
+  nblocks = 1;  /* top always exists */
+
+  /* traverse fastbins */
+  nfastblocks = 0;
+  fastavail = 0;
+  for (i = 0; i < NFASTBINS; ++i) {
+    for (p = av->fastbins[i]; p != 0; p = p->fd) {
+      ++nfastblocks;
+      fastavail += chunksize(p);
+    }
+  }
+  avail += fastavail;
+
+  /* traverse regular bins */
+  for (i = 1; i < NBINS; ++i) {
+    b = bin_at(av, i);
+    for (p = last(b); p != b; p = p->bk) {
+      ++nblocks;
+      avail += chunksize(p);
+    }
+  }
+
+  mi.smblks = nfastblocks;
+  mi.ordblks = nblocks;
+  mi.fordblks = avail;
+  mi.uordblks = av->system_mem - avail;
+  mi.arena = av->system_mem;
+  mi.fsmblks = fastavail;
+  mi.keepcost = chunksize(av->top);
+  /* Not measured here; set explicitly so callers never read garbage */
+  mi.hblks = mi.hblkhd = mi.usmblks = 0;
+  return mi;
+}
+
+/*
+  ------------------------------ malloc_stats ------------------------------
+*/
+
+void mSTATs()   /* stub definition with an empty parameter list */
+{               /* empty body: no statistics are gathered or printed */
+}
+
+
+/*
+  ------------------------------ mallopt ------------------------------
+*/
+
+#if 0 /* mALLOPt is compiled out: this whole definition is never built */
+#if __STD_C
+int mALLOPt(int param_number, int value)
+#else
+int mALLOPt(param_number, value) int param_number; int value;
+#endif
+{ /* empty stub: neither argument is used and no value is returned */
+}
+#endif
+
+
+/*
+  -------------------- Alternative MORECORE functions --------------------
+*/
+
+
+/*
+  General Requirements for MORECORE.
+
+  The MORECORE function must have the following properties:
+
+  If MORECORE_CONTIGUOUS is false:
+
+    * MORECORE must allocate in multiples of pagesize. It will
+      only be called with arguments that are multiples of pagesize.
+
+    * MORECORE(0) must return an address that is at least
+      MALLOC_ALIGNMENT aligned. (Page-aligning always suffices.)
+
+  else (i.e. If MORECORE_CONTIGUOUS is true):
+
+    * Consecutive calls to MORECORE with positive arguments
+      return increasing addresses, indicating that space has been
+      contiguously extended.
+
+    * MORECORE need not allocate in multiples of pagesize.
+      Calls to MORECORE need not have args of multiples of pagesize.
+
+    * MORECORE need not page-align.
+
+  In either case:
+
+    * MORECORE may allocate more memory than requested. (Or even less,
+      but this will generally result in a malloc failure.)
+
+    * MORECORE must not allocate memory when given argument zero, but
+      instead return one past the end address of memory from previous
+      nonzero call. This malloc does NOT call MORECORE(0)
+      until at least one call with positive arguments is made, so
+      the initial value returned is not important.
+
+    * Even though consecutive calls to MORECORE need not return contiguous
+      addresses, it must be OK for malloc'ed chunks to span multiple
+      regions in those cases where they do happen to be contiguous.
+
+    * MORECORE need not handle negative arguments -- it may instead
+      just return MORECORE_FAILURE when given negative arguments.
+      Negative arguments are always multiples of pagesize. MORECORE
+      must not misinterpret negative args as large positive unsigned
+      args. You can suppress all such calls from even occurring by defining
+      MORECORE_CANNOT_TRIM.
+
+  There is some variation across systems about the type of the
+  argument to sbrk/MORECORE. If size_t is unsigned, then it cannot
+  actually be size_t, because sbrk supports negative args, so it is
+  normally the signed type of the same width as size_t (sometimes
+  declared as "intptr_t", and sometimes "ptrdiff_t").  It doesn't much
+  matter though. Internally, we use "long" as arguments, which should
+  work across all reasonable possibilities.
+
+  Additionally, if MORECORE ever returns failure for a positive
+  request, and HAVE_MMAP is true, then mmap is used as a noncontiguous
+  system allocator. This is a useful backup strategy for systems with
+  holes in address spaces -- in this case sbrk cannot contiguously
+  expand the heap, but mmap may be able to map noncontiguous space.
+
+  If you'd like mmap to ALWAYS be used, you can define MORECORE to be
+  a function that always returns MORECORE_FAILURE.
+
+  If you are using this malloc with something other than sbrk (or its
+  emulation) to supply memory regions, you probably want to set
+  MORECORE_CONTIGUOUS as false.  As an example, here is a custom
+  allocator kindly contributed for pre-OSX macOS.  It uses virtually
+  but not necessarily physically contiguous non-paged memory (locked
+  in, present and won't get swapped out).  You can use it by
+  uncommenting this section, adding some #includes, and setting up the
+  appropriate defines above:
+
+      #define MORECORE osMoreCore
+      #define MORECORE_CONTIGUOUS 0
+
+  There is also a shutdown routine that should somehow be called for
+  cleanup upon program exit.
+
+  #define MAX_POOL_ENTRIES 100
+  #define MINIMUM_MORECORE_SIZE  (64 * 1024)
+  static int next_os_pool;
+  void *our_os_pools[MAX_POOL_ENTRIES];
+
+  void *osMoreCore(int size)
+  {
+    void *ptr = 0;
+    static void *sbrk_top = 0;
+
+    if (size > 0)
+    {
+      if (size < MINIMUM_MORECORE_SIZE)
+         size = MINIMUM_MORECORE_SIZE;
+      if (CurrentExecutionLevel() == kTaskLevel)
+         ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0);
+      if (ptr == 0)
+      {
+        return (void *) MORECORE_FAILURE;
+      }
+      // save ptrs so they can be freed during cleanup
+      our_os_pools[next_os_pool] = ptr;
+      next_os_pool++;
+      ptr = (void *) ((((unsigned long) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK);
+      sbrk_top = (char *) ptr + size;
+      return ptr;
+    }
+    else if (size < 0)
+    {
+      // we don't currently support shrink behavior
+      return (void *) MORECORE_FAILURE;
+    }
+    else
+    {
+      return sbrk_top;
+    }
+  }
+
+  // cleanup any allocated memory pools
+  // called as last thing before shutting down driver
+
+  void osCleanupMem(void)
+  {
+    void **ptr;
+
+    for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++)
+      if (*ptr)
+      {
+         PoolDeallocate(*ptr);
+         *ptr = 0;
+      }
+  }
+
+*/
+
+
+
+/* ------------------------------------------------------------
+History:
+
+[see ftp://g.oswego.edu/pub/misc/malloc.c for the history of dlmalloc]
+
+*/
diff --git a/cvmx-malloc/malloc.h b/cvmx-malloc/malloc.h
new file mode 100644
index 000000000000..6d6f6343f790
--- /dev/null
+++ b/cvmx-malloc/malloc.h
@@ -0,0 +1,213 @@
+/*
+Copyright (c) 2001 Wolfram Gloger
+Copyright (c) 2006 Cavium networks
+
+Permission to use, copy, modify, distribute, and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that (i) the above copyright notices and this permission
+notice appear in all copies of the software and related documentation,
+and (ii) the name of Wolfram Gloger may not be used in any advertising
+or publicity relating to the software.
+
+THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+IN NO EVENT SHALL WOLFRAM GLOGER BE LIABLE FOR ANY SPECIAL,
+INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY
+OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef _MALLOC_H
+#define _MALLOC_H 1
+
+#undef _LIBC
+#ifdef _LIBC
+#include <features.h>
+#endif
+
+/*
+  $Id: malloc.h 30481 2007-12-05 21:46:59Z rfranz $
+  `ptmalloc2', a malloc implementation for multiple threads without
+  lock contention, by Wolfram Gloger <wg@malloc.de>.
+
+  VERSION 2.7.0
+
+  This work is mainly derived from malloc-2.7.0 by Doug Lea
+  <dl@cs.oswego.edu>, which is available from:
+
+                 ftp://gee.cs.oswego.edu/pub/misc/malloc.c
+
+  This trimmed-down header file only provides function prototypes and
+  the exported data structures.  For more detailed function
+  descriptions and compile-time options, see the source file
+  `malloc.c'.
+*/
+
+#if 0
+# include <stddef.h>
+# define __malloc_ptr_t  void *
+# undef  size_t
+# define size_t          unsigned long
+# undef  ptrdiff_t
+# define ptrdiff_t       long
+#else
+# undef  Void_t
+# define Void_t       void
+# define __malloc_ptr_t  char *
+#endif
+
+#ifdef _LIBC
+/* Used by GNU libc internals. */
+# define __malloc_size_t size_t
+# define __malloc_ptrdiff_t ptrdiff_t
+#elif !defined __attribute_malloc__
+# define __attribute_malloc__
+#endif
+
+#ifdef __GNUC__
+
+/* GCC can always grok prototypes.  For C++ programs we add throw()
+   to help it optimize the function calls.  But this works only with
+   gcc 2.8.x and egcs.  */
+# if defined __cplusplus && (__GNUC__ >= 3 || __GNUC_MINOR__ >= 8)
+#  define __THROW	throw ()
+# else
+#  define __THROW
+# endif
+# define __MALLOC_P(args)	args __THROW
+/* This macro will be used for functions which might take C++ callback
+   functions.  */
+# define __MALLOC_PMT(args)	args
+
+#else	/* Not GCC.  */
+
+# define __THROW
+
+# if (defined __STDC__ && __STDC__) || defined __cplusplus
+
+#  define __MALLOC_P(args)	args
+#  define __MALLOC_PMT(args)	args
+
+# else	/* Not ANSI C or C++.  */
+
+#  define __MALLOC_P(args)	()	/* No prototypes.  */
+#  define __MALLOC_PMT(args)	()
+
+# endif	/* ANSI C or C++.  */
+
+#endif	/* GCC.  */
+
+#ifndef NULL
+# ifdef __cplusplus
+#  define NULL	0
+# else
+#  define NULL	((__malloc_ptr_t) 0)
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Nonzero if the malloc is already initialized.  */
+#ifdef _LIBC
+/* In the GNU libc we rename the global variable
+   `__malloc_initialized' to `__libc_malloc_initialized'.  */
+# define __malloc_initialized __libc_malloc_initialized
+#endif
+extern int cvmx__malloc_initialized;
+
+
+/* SVID2/XPG mallinfo structure: the record type returned by mallinfo(). */
+
+struct mallinfo {
+  int arena;    /* non-mmapped space allocated from system (arena system_mem) */
+  int ordblks;  /* number of free chunks (top plus chunks in the regular bins) */
+  int smblks;   /* number of fastbin blocks */
+  int hblks;    /* number of mmapped regions (not measured by mALLINFo) */
+  int hblkhd;   /* space in mmapped regions (not measured by mALLINFo) */
+  int usmblks;  /* maximum total allocated space (not measured by mALLINFo) */
+  int fsmblks;  /* space available in freed fastbin blocks */
+  int uordblks; /* total allocated space (system_mem minus total free space) */
+  int fordblks; /* total free space (top + fastbin chunks + binned chunks) */
+  int keepcost; /* top-most, releasable (via malloc_trim) space: size of top */
+};
+
+/* Returns a copy of the updated current mallinfo. */
+extern struct mallinfo mallinfo __MALLOC_P ((void));
+
+/* SVID2/XPG mallopt options */
+#ifndef M_MXFAST
+# define M_MXFAST  1	/* maximum request size for "fastbins" */
+#endif
+#ifndef M_NLBLKS
+# define M_NLBLKS  2	/* UNUSED in this malloc */
+#endif
+#ifndef M_GRAIN
+# define M_GRAIN   3	/* UNUSED in this malloc */
+#endif
+#ifndef M_KEEP
+# define M_KEEP    4	/* UNUSED in this malloc */
+#endif
+
+/* mallopt options that actually do something */
+#define M_TRIM_THRESHOLD    -1
+#define M_TOP_PAD           -2
+#define M_MMAP_THRESHOLD    -3
+#define M_MMAP_MAX          -4
+#define M_CHECK_ACTION      -5
+
+/* General SVID/XPG interface to tunable parameters. */
+extern int mallopt __MALLOC_P ((int __param, int __val));
+
+/* Release all but __pad bytes of freed top-most memory back to the
+   system. Return 1 if successful, else 0. */
+extern int malloc_trim __MALLOC_P ((size_t __pad));
+
+/* Report the number of usable allocated bytes associated with allocated
+   chunk __ptr. */
+extern size_t malloc_usable_size __MALLOC_P ((__malloc_ptr_t __ptr));
+
+/* Prints brief summary statistics on stderr. */
+extern void malloc_stats __MALLOC_P ((void));
+
+/* Record the state of all malloc variables in an opaque data structure. */
+extern __malloc_ptr_t malloc_get_state __MALLOC_P ((void));
+
+/* Restore the state of all malloc variables from data obtained with
+   malloc_get_state(). */
+extern int malloc_set_state __MALLOC_P ((__malloc_ptr_t __ptr));
+
+/* Called once when malloc is initialized; redefining this variable in the
+   application provides the preferred way to set up the hook pointers.
+   NOTE(review): prefix is "cmvx__", the other hooks use "cvmx__" -- confirm. */
+extern void (*cmvx__malloc_initialize_hook) __MALLOC_PMT ((void));
+/* Hooks for debugging and user-defined versions. */
+extern void (*cvmx__free_hook) __MALLOC_PMT ((__malloc_ptr_t __ptr,
+					__const __malloc_ptr_t));
+extern __malloc_ptr_t (*cvmx__malloc_hook) __MALLOC_PMT ((size_t __size,
+						    __const __malloc_ptr_t));
+extern __malloc_ptr_t (*cvmx__realloc_hook) __MALLOC_PMT ((__malloc_ptr_t __ptr,
+						     size_t __size,
+						     __const __malloc_ptr_t));
+extern __malloc_ptr_t (*cvmx__memalign_hook) __MALLOC_PMT ((size_t __alignment,
+						      size_t __size,
+						      __const __malloc_ptr_t));
+extern void (*__after_morecore_hook) __MALLOC_PMT ((void));
+
+/* Activate a standard set of debugging hooks. */
+extern void cvmx__malloc_check_init __MALLOC_P ((void));
+
+/* Internal routines, operating on "arenas".  */
+struct malloc_state;
+typedef struct malloc_state *mstate;
+#ifdef __cplusplus
+} /* end of extern "C"; no ';' here, it would be a stray empty declaration */
+#endif
+
+
+#endif /* malloc.h */
diff --git a/cvmx-malloc/thread-m.h b/cvmx-malloc/thread-m.h
new file mode 100644
index 000000000000..de9ba6c6553d
--- /dev/null
+++ b/cvmx-malloc/thread-m.h
@@ -0,0 +1,73 @@
+/*
+Copyright (c) 2001 Wolfram Gloger
+Copyright (c) 2006 Cavium networks
+
+Permission to use, copy, modify, distribute, and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that (i) the above copyright notices and this permission
+notice appear in all copies of the software and related documentation,
+and (ii) the name of Wolfram Gloger may not be used in any advertising
+or publicity relating to the software.
+
+THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+IN NO EVENT SHALL WOLFRAM GLOGER BE LIABLE FOR ANY SPECIAL,
+INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY
+OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+*/
+
+/* $Id: thread-m.h 30481 2007-12-05 21:46:59Z rfranz $
+   One out of _LIBC, USE_PTHREADS, USE_THR or USE_SPROC should be
+   defined, otherwise the token NO_THREADS and dummy implementations
+   of the macros will be defined.  */
+
+#ifndef _THREAD_M_H
+#define _THREAD_M_H
+
+#undef thread_atfork_static
+
+
+#undef NO_THREADS /* No threads, provide dummy macros */
+
+typedef int thread_id;
+
+/* The mutex functions used to do absolutely nothing, i.e. lock,
+   trylock and unlock would always just return 0.  However, even
+   without any concurrently active threads, a mutex can be used
+   legitimately as an `in use' flag.  To make the code that is
+   protected by a mutex async-signal safe, these macros would have to
+   be based on atomic test-and-set operations, for example. */
+#ifdef __OCTEON__
+typedef cvmx_spinlock_t mutex_t;   /* Octeon: a mutex is a cvmx spinlock */
+#define MUTEX_INITIALIZER          CVMX_SPINLOCK_UNLOCKED_VAL
+#define mutex_init(m)              cvmx_spinlock_init(m)
+#define mutex_lock(m)              cvmx_spinlock_lock(m)
+#define mutex_trylock(m)           (cvmx_spinlock_trylock(m))
+#define mutex_unlock(m)            cvmx_spinlock_unlock(m)
+#else
+/* Fallback: a plain int used as an `in use' flag (1 = held, 0 = free). */
+typedef int mutex_t;
+/* mutex_lock yields 0; mutex_trylock yields 0 if it took the lock, else 1. */
+#define MUTEX_INITIALIZER          0
+#define mutex_init(m)              (*(m) = 0)
+#define mutex_lock(m)              ((*(m) = 1), 0)
+#define mutex_trylock(m)           (*(m) ? 1 : ((*(m) = 1), 0))
+#define mutex_unlock(m)            (*(m) = 0)
+#endif
+
+
+
+typedef void *tsd_key_t;   /* the key variable itself holds the stored pointer */
+#define tsd_key_create(key, destr) do {} while(0)
+#define tsd_setspecific(key, data) ((key) = (data))
+#define tsd_getspecific(key, vptr) ((vptr) = (key))
+/* Fork handlers are not registered in this configuration: expands to a no-op. */
+#define thread_atfork(prepare, parent, child) do {} while(0)
+
+
+#endif /* !defined(_THREAD_M_H) */