--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * mspan.c
+ * Memory span management.
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/mspan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/mspan.h"
+
+/*
+ * PostgreSQL normally uses 8kB pages for most things, but many common
+ * architecture/operating system pairings use a 4kB page size for memory
+ * allocation, so we do that here also. We assume that a large allocation
+ * is likely to begin on a page boundary; if not, we'll discard bytes from
+ * the beginning and end of the object and use only the middle portion that
+ * is properly aligned. This works, but is not ideal, so it's best to keep
+ * this conservatively small. There don't seem to be any common architectures
+ * where the page size is less than 4kB, so this should be good enough; also,
+ * the smaller we make it, the bigger the page map will be.
+ */
+#define MSPAN_PAGE_BITS 12
+#define MSPAN_PAGE_SIZE (1 << MSPAN_PAGE_BITS)
+#define MSPAN_PAGE_MASK (MSPAN_PAGE_SIZE - 1)
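+
+/*
+ * For example (illustrative only): a 12288-byte (3-page) object that starts
+ * 100 bytes past a page boundary contributes only the two fully-aligned
+ * pages in its middle; the first 3996 bytes and the last 100 bytes would be
+ * discarded.
+ */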
+
+/* Maximum number of pages for a 32-bit address space. */
+#define MSPAN_MAX_32BIT_PAGES (1 << (32 - MSPAN_PAGE_BITS))
+
+/*
+ * Small allocations are handled by dividing a relatively large chunk of
+ * memory into many small objects of equal size. The chunks of memory used
+ * for this purpose are called superblocks. Since a superblock can only be
+ * used to satisfy allocations of a single size class, it's important not
+ * to make superblocks too large. On the other hand, setting up a
+ * superblock has some overhead, so we don't want to make them too small,
+ * either.
+ */
+#define MSPAN_SMALL_OBJECT_SIZE_LIMIT 8192
+#define MSPAN_PAGES_PER_SUPERBLOCK 16
+#define MSPAN_SUPERBLOCK_SIZE \
+ (MSPAN_PAGES_PER_SUPERBLOCK * MSPAN_PAGE_SIZE)
+#define MSPAN_NUMBER_OF_SIZE_CLASSES 36
+#define MSPAN_NUMBER_OF_FULLNESS_CLASSES 4
+#define MSPAN_SMALL_ALLOCATION_LISTS \
+ (MSPAN_NUMBER_OF_SIZE_CLASSES * MSPAN_NUMBER_OF_FULLNESS_CLASSES)
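+
+/*
+ * For example (illustrative only): with 16 pages of 4kB each, a superblock
+ * is 64kB. A superblock dedicated to a hypothetical 64-byte size class can
+ * hold up to 65536 / 64 = 1024 objects, and even at the 8kB small-object
+ * limit a superblock still holds 65536 / 8192 = 8 objects.
+ */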
+
+/*
+ * Management information for a span of memory.
+ */
+struct mspan
+{
+ relptr(void) parent; /* Context if used, manager if free. */
+ relptr(mspan) prevspan; /* Previous span. */
+ relptr(mspan) nextspan; /* Next span. */
+ Size first_page; /* Starting page number. */
+ Size npages; /* Length of span in pages. */
+ uint16 span_type; /* Type of span. */
+ uint16 ninitialized; /* Maximum number of objects ever allocated. */
+ uint16 nused; /* Number of objects currently allocated. */
+ uint16 firstfree; /* First object on free list. */
+};
+
+/*
+ * Management information for an allocation context.
+ */
+struct mspan_context
+{
+ relptr(mspan) large_allocation;
+ relptr(mspan) small_allocation[MSPAN_SMALL_ALLOCATION_LISTS];
+};
+
+/* Helper functions. */
+static mspan_context *mspan_allocate_context_descriptor(char *base,
+ mspan_manager *mgr);
+static mspan *mspan_find_free_span(char *base, mspan_manager *mgr,
+ Size minpages, Size maxpages);
+static void mspan_update_page_map(char *base, mspan_manager *mgr,
+ Size first_page, Size npages, Size value);
+
+/*
+ * Initialize backend-private mspan_manager.
+ *
+ * We must be prepared to manage memory anywhere in the process address
+ * space.
+ */
+void
+mspan_initialize_private_manager(mspan_manager *mgr)
+{
+ unsigned bits = SIZEOF_SIZE_T * BITS_PER_BYTE;
+
+ memset(mgr, 0, sizeof(mspan_manager));
+
+ aspace_map_initialize(&mgr->page_map,
+ ((uint64) 1) << (bits - MSPAN_PAGE_BITS),
+ bits <= 32 ? ASPACE_MAP_32BIT_VALUES : 0);
+}
+
+/*
+ * Initialize dynamic shared memory mspan_manager.
+ *
+ * We need only be prepared to manage the specified number of bytes.
+ */
+mspan_manager *
+mspan_initialize_dsm_manager(dsm_segment *seg, void *start, Size nbytes)
+{
+ char *segbase = dsm_segment_address(seg);
+ Size segsize = dsm_segment_map_length(seg);
+ char *astart = start;
+ char *aend = astart + nbytes;
+ mspan_manager *mgr;
+
+ /* Arena to be managed must be within the segment. */
+ Assert(astart >= segbase && astart + nbytes <= segbase + segsize);
+
+ /* Arena to be managed must not be smaller than the metadata. */
+ Assert(nbytes >= sizeof(mspan_manager));
+
+ /* Allocate and zero space for the manager. */
+ mgr = (mspan_manager *) astart;
+ astart += sizeof(mspan_manager);
+ memset(mgr, 0, sizeof(mspan_manager));
+
+ /* Initialize those fields that require it. */
+ mgr->base = astart - segbase;
+ if ((mgr->base & MSPAN_PAGE_MASK) != 0)
+ mgr->base = (mgr->base & ~MSPAN_PAGE_MASK) + MSPAN_PAGE_SIZE;
+ mgr->npages = ((aend - segbase) - mgr->base) >> MSPAN_PAGE_BITS;
+ Assert(mgr->npages > 0);
+ aspace_map_initialize(&mgr->page_map, mgr->npages,
+ mgr->npages <= MSPAN_MAX_32BIT_PAGES ?
+ ASPACE_MAP_32BIT_VALUES : 0);
+
+ return mgr;
+}
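+
+/*
+ * For illustration, a manager covering a whole new segment might be set up
+ * like this (sketch only; "segsize" is a placeholder, and dsm_create() is
+ * assumed to take just a size, as it does at the time of this patch):
+ *
+ *		dsm_segment *seg = dsm_create(segsize);
+ *		mspan_manager *mgr;
+ *
+ *		mgr = mspan_initialize_dsm_manager(seg, dsm_segment_address(seg),
+ *										   dsm_segment_map_length(seg));
+ */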
+
+/*
+ * Create a new allocation context within an address space.
+ */
+mspan_context *
+mspan_context_create(dsm_segment *seg, mspan_manager *mgr)
+{
+ char *base = (seg != NULL ? dsm_segment_address(seg) : NULL);
+ mspan_context *cxt;
+
+ if (relptr_is_null(mgr->freecontext))
+ cxt = mspan_allocate_context_descriptor(base, mgr);
+ else
+ {
+ /* Pop a previously-allocated context from the free list. */
+ cxt = relptr_access(base, mgr->freecontext);
+ mgr->freecontext.relptr_off = * (Size *) cxt;
+ }
+
+ /* All lists of allocations begin empty. */
+ memset(cxt, 0, sizeof(mspan_context));
+
+ return cxt;
+}
+
+/*
+ * Allocate new space for a new context descriptor.
+ *
+ * We expect the number of contexts to remain small. Therefore, when
+ * allocating backend-local memory, we allocate them one at a time from the
+ * OS; and when allocating from dynamic shared memory, we allocate space for
+ * them one page at a time, rather than (for example) a full superblock.
+ *
+ * Context descriptors are never freed; instead, when the user destroys a
+ * context, we just push the context descriptor onto a free list. Because
+ * of this, we don't need a span describing the space set aside for context
+ * descriptors, or page map entries pointing to it. This helps us avoid
+ * circular dependencies inside the allocator.
+ */
+static mspan_context *
+mspan_allocate_context_descriptor(char *base, mspan_manager *mgr)
+{
+ mspan_context *cxt;
+ mspan *span;
+ Size pageno;
+
+ /* Outside of a dynamic shared memory segment, just allocate from OS. */
+ if (base == NULL)
+ {
+ cxt = malloc(sizeof(mspan_context));
+ if (cxt == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ return cxt;
+ }
+
+ /*
+ * We must allocate from within the segment, so can't fall back on malloc.
+ * It's desirable to avoid fragmenting spans that are large enough to
+ * contain a superblock, but smaller spans are not as useful, so they're
+ * a good way to satisfy our single-page request. Therefore, we first
+ * look for a small span, then try to allocate from the boundary, then
+ * finally look for a large span. If none of that works, we're out
+ * of memory.
+ */
+ span = mspan_find_free_span(base, mgr, 1, MSPAN_PAGES_PER_SUPERBLOCK - 1);
+ if (span != NULL)
+ pageno = span->first_page;
+ else
+ {
+ if (mgr->boundary < mgr->npages)
+ pageno = mgr->boundary++;
+ else
+ {
+ span = mspan_find_free_span(base, mgr,
+ MSPAN_PAGES_PER_SUPERBLOCK, 0);
+ if (span == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ pageno = span->first_page;
+ }
+ }
+
+ /* Remove the page map entry for the start of the span. */
+ mspan_update_page_map(base, mgr, pageno, 1, 0);
+
+ /*
+ * XXX. If the span is exactly one page long, we want to free it.
+ * Presumably this means the mspan_manager (not the context) must
+ * manage the list of free spans. Alternatively, we could leak it,
+ * which is no worse than what would happen for a garden-variety
+ * dsm-lifetime span allocation.
+ *
+ * If the span is more than one page long, we want to increment the
+ * first page, decrement the page count, and make a page map entry
+ * for whatever's left over. Technically the last doesn't matter,
+ * since the adjacent "span" is one which can never be freed, but
+ * let's do it anyway for the sake of tidiness.
+ */
+
+ /*
+ * XXX. Once we've worked things out on a span level, we need to carve
+ * up the page, put all but one item on the manager-level context free
+ * list, and return the last item.
+ */
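+
+ /*
+ * Illustrative sketch of the remaining steps (not implemented here; the
+ * steps in angle brackets are hypothetical):
+ *
+ *		if (span != NULL)
+ *		{
+ *			mspan_unlink_span(base, span);
+ *			if (span->npages > 1)
+ *			{
+ *				span->first_page++;
+ *				span->npages--;
+ *				mspan_update_page_map(base, mgr, span->first_page, 1,
+ *									  <offset of span>);
+ *				<put span back on the appropriate free list>
+ *			}
+ *			else
+ *				<return the one-page span's descriptor to the manager>
+ *		}
+ *		<carve the page at base + mgr->base + (pageno << MSPAN_PAGE_BITS)
+ *		into MSPAN_PAGE_SIZE / sizeof(mspan_context) descriptors, thread
+ *		all but one onto mgr->freecontext using the first Size of each as
+ *		the next link (as mspan_context_create expects), and return the
+ *		remaining one>
+ */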
+
+ /* XXX. Unfinished: no context descriptor is actually returned yet. */
+ return NULL;
+}
+
+/*
+ * Find a previously-allocated span that is now available for reuse.
+ */
+static mspan *
+mspan_find_free_span(char *base, mspan_manager *mgr, Size minpages,
+ Size maxpages)
+{
+ Size start_exact_search;
+ Size stop_exact_search;
+ Size i;
+
+ Assert(minpages > 0);
+
+ /*
+ * Every free list except the last holds spans of one particular size;
+ * if any relevant list is non-empty, we can just return the first item.
+ */
+ start_exact_search = Min(minpages, MSPAN_NUM_FREE_LISTS) - 1;
+ stop_exact_search = maxpages == 0 || maxpages > MSPAN_NUM_FREE_LISTS - 1 ?
+ MSPAN_NUM_FREE_LISTS - 1 : maxpages;
+ for (i = start_exact_search; i < stop_exact_search; ++i)
+ if (!relptr_is_null(mgr->freelist[i]))
+ return relptr_access(base, mgr->freelist[i]);
+
+ /* The very last free list holds all of the remaining, larger spans. */
+ if (maxpages == 0 || maxpages > MSPAN_NUM_FREE_LISTS - 1)
+ {
+ mspan *span;
+ mspan *best = NULL;
+
+ span = relptr_access(base, mgr->freelist[MSPAN_NUM_FREE_LISTS - 1]);
+ while (span != NULL)
+ {
+ if (span->npages >= minpages &&
+ (maxpages == 0 || span->npages <= maxpages) &&
+ (best == NULL || span->npages < best->npages))
+ best = span;
+ span = relptr_access(base, span->nextspan);
+ }
+ return best;
+ }
+
+ return NULL;
+}
+
+/*
+ * Update the page map.
+ */
+static void
+mspan_update_page_map(char *base, mspan_manager *mgr, Size first_page,
+ Size npages, Size value)
+{
+ aspace_map_handle h;
+
+ h.as_map = &mgr->page_map;
+ h.as_base = base;
+ h.as_allocator = NULL; /* XXX FIXME */
+ h.as_allocator_private = NULL; /* XXX FIXME */
+
+ aspace_map_set_range(&h, first_page, npages, value);
+}
+
+/*
+ * Remove a span from the circularly-linked list that presently contains it.
+ */
+static void
+mspan_unlink_span(char *base, mspan *span)
+{
+ mspan *next;
+ mspan *prev;
+#ifdef USE_ASSERT_CHECKING
+ mspan *null = NULL;
+#endif
+
+ next = relptr_access(base, span->nextspan);
+ prev = relptr_access(base, span->prevspan);
+ Assert(next != NULL && prev != NULL);
+ next->prevspan.relptr_off = span->prevspan.relptr_off;
+ prev->nextspan.relptr_off = span->nextspan.relptr_off;
+#ifdef USE_ASSERT_CHECKING
+ relptr_store(base, span->prevspan, null);
+ relptr_store(base, span->nextspan, null);
+#endif
+}
+
+static mspan *mspan_allocate_span_descriptor();
+static mspan *mspan_allocate_span();
+static void mspan_free_span(mspan *);
+
+static void mspan_init_superblock(mspan *);
+static void *mspan_allocate_from_superblock(mspan_context *);
+static void mspan_free_to_superblock();
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * aspace_map.h
+ * Address space map.
+ *
+ * An aspace_map is a lookup table where the keys and values are 64-bit
+ * integers. The key limit (one more than the highest allowable key)
+ * must be specified at the time the map is created. The value may be any
+ * 64-bit integer unless ASPACE_MAP_32BIT_VALUES is specified, in which
+ * case it must be less than 2^32. Attempting to look up a key not
+ * entered into the table will return 0.
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/aspace_map.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef ASPACE_MAP_H
+#define ASPACE_MAP_H
+
+#include "storage/dsm.h"
+
+/*
+ * Flags that can be passed to aspace_map_initialize().
+ *
+ * If ASPACE_MAP_DIRECT is specified, the map will be stored as a single
+ * array whose size is equal to the key limit. ASPACE_MAP_INDIRECT will
+ * use a two-level radix tree, with exactly 18 bits handled at the bottom
+ * level. ASPACE_MAP_DOUBLE_INDIRECT will use a three-level radix tree,
+ * with 18 bits handled in each of the bottom two levels and the remainder
+ * handled at the top level. At most one of these flags can be specified.
+ * If none of these flags is given, an appropriate value will be chosen
+ * based on the key limit.
+ *
+ * If ASPACE_MAP_32BIT_VALUES is used, all values later stored in the map
+ * must be less than 2^32. This reduces memory consumption by about 50%.
+ */
+#define ASPACE_MAP_DIRECT 0x0001
+#define ASPACE_MAP_INDIRECT 0x0002
+#define ASPACE_MAP_DOUBLE_INDIRECT 0x0003
+#define ASPACE_MAP_32BIT_VALUES 0x0004
+
+/*
+ * Since we expect entries to be clustered within the available keyspace,
+ * it makes sense to cache a few entries at each level for quick access.
+ * This also helps to keep the map small; if the total number of entries at
+ * any given level fits within the cache, we don't need a real array for
+ * that level.
+ */
+#define ASPACE_CACHESIZE_DOUBLE_INDIRECT 4
+#define ASPACE_CACHESIZE_INDIRECT 4
+#define ASPACE_CACHESIZE_DIRECT 16
+
+typedef struct
+{
+ uint64 key;
+ uint64 value; /* If indirect, (relative) pointer; else value. */
+} aspace_pair;
+
+typedef struct
+{
+ unsigned flags;
+ uint64 maxkey;
+ uint64 map; /* Pointer or relative pointer. */
+ aspace_pair direct_cache[ASPACE_CACHESIZE_DIRECT];
+ aspace_pair indirect_cache[ASPACE_CACHESIZE_INDIRECT];
+ aspace_pair double_indirect_cache[ASPACE_CACHESIZE_DOUBLE_INDIRECT];
+} aspace_map;
+
+typedef void *(*aspace_map_allocator)(void *allocator_private, Size size);
+
+/*
+ * An aspace_map might be located within a dynamic shared memory segment,
+ * so we need to separate the shared state from the backend-private state.
+ * There's no special API for initializing an aspace_map_handle; callers
+ * are expected to construct a suitable object by filling in the necessary
+ * fields.
+ */
+typedef struct
+{
+ aspace_map *as_map;
+ char *as_base;
+ aspace_map_allocator as_allocator;
+ void *as_allocator_private;
+} aspace_map_handle;
+
+/* API functions. */
+extern void aspace_map_initialize(aspace_map *, uint64 key_limit, int flags);
+extern void aspace_map_set(aspace_map_handle *map, uint64 key, uint64 value);
+extern void aspace_map_set_range(aspace_map_handle *map,
+ uint64 first_key, uint64 nkeys, uint64 value);
+extern uint64 aspace_map_get(aspace_map_handle *map, uint64 key);
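+
+/*
+ * Illustrative usage for a backend-private map (sketch only; "my_alloc" is
+ * a hypothetical callback supplying backing memory, and none of these
+ * names appear elsewhere in this patch):
+ *
+ *		aspace_map	map;
+ *		aspace_map_handle h;
+ *
+ *		aspace_map_initialize(&map, 1048576, 0);
+ *		h.as_map = &map;
+ *		h.as_base = NULL;
+ *		h.as_allocator = my_alloc;
+ *		h.as_allocator_private = NULL;
+ *		aspace_map_set(&h, 42, 7);
+ *		Assert(aspace_map_get(&h, 42) == 7);
+ *		Assert(aspace_map_get(&h, 43) == 0);
+ */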
+
+#endif /* ASPACE_MAP_H */
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * mspan.h
+ * Memory span management.
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/mspan.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef MSPAN_H
+#define MSPAN_H
+
+#include "utils/aspace_map.h"
+
+/*
+ * Relative pointers.
+ *
+ * These are intended to be used when storing an address that may be
+ * relative either to the base of the process's address space or to the
+ * base of some dynamic shared memory segment mapped therein.
+ *
+ * The idea here is that you declare a relative pointer as relptr(type)
+ * and then use relptr_access to dereference it and relptr_store to change
+ * it. The use of a union here is a hack, because what's stored in the
+ * relptr is always a Size, never an actual pointer. But including a pointer
+ * in the union allows us to use stupid macro tricks to provide some measure
+ * of type-safety.
+ */
+#define relptr(type) union { type *relptr_type; Size relptr_off; }
+#define relptr_access(base, rp) \
+ (AssertVariableIsOfTypeMacro(base, char *), \
+ (__typeof__((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \
+ (base + (rp).relptr_off)))
+#define relptr_is_null(rp) \
+ ((rp).relptr_off == 0)
+#define relptr_store(base, rp, val) \
+ (AssertVariableIsOfTypeMacro(base, char *), \
+ AssertVariableIsOfTypeMacro(val, __typeof__((rp).relptr_type)), \
+ (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base)))
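+
+/*
+ * Example (illustrative only; "seg" and "sp" are placeholder names):
+ *
+ *		char	   *base = dsm_segment_address(seg);
+ *		relptr(mspan) rp;
+ *		mspan	   *sp;
+ *
+ *		relptr_store(base, rp, sp);
+ *		Assert(relptr_access(base, rp) == sp);
+ *		Assert(!relptr_is_null(rp));
+ */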
+
+/*
+ * Flags that can be associated with memory allocations.
+ */
+#define MSPAN_ALLOW_HUGE 0x0001 /* allow allocations > 1 GB */
+#define MSPAN_SOFT_FAIL 0x0002 /* return NULL on failure */
+
+/*
+ * Large objects - and the superblocks used to satisfy smaller allocations -
+ * are allocated from free lists. Each free list except the last holds
+ * available spans of one particular size; the final free list holds all
+ * the remaining ones.
+ */
+#define MSPAN_NUM_FREE_LISTS 256
+
+/* Forward declarations. */
+struct mspan;
+struct mspan_context;
+struct mspan_manager;
+typedef struct mspan mspan;
+typedef struct mspan_context mspan_context;
+typedef struct mspan_manager mspan_manager;
+
+/*
+ * One mspan_manager is needed for each allocation space. This means that
+ * we have one for our own address space, which is stored in a static variable;
+ * and one for each dynamic shared memory segment in which we want to use
+ * this facility, which should be stored within that segment.
+ */
+struct mspan_manager
+{
+ Size npages; /* # of managed pages in dsm; 0 for private */
+ Size base; /* offset of page 0 within dsm; 0 for private */
+ Size boundary; /* first unallocated page in dsm; 0 for private */
+ unsigned ncontexts; /* # of outstanding contexts */
+ aspace_map page_map; /* map pages to mspans */
+ relptr(mspan_context) freecontext; /* head of free list of contexts */
+ relptr(mspan) freelist[MSPAN_NUM_FREE_LISTS]; /* spans for freespace */
+};
+
+/* Manager for backend-private address space. */
+extern mspan_manager private_mspan_manager;
+
+/* Initialize a manager in backend-private or dynamic shared memory. */
+extern void mspan_initialize_private_manager(mspan_manager *);
+extern mspan_manager *mspan_initialize_dsm_manager(dsm_segment *,
+ void *, Size nbytes);
+
+/* Create or destroy a memory context. */
+extern mspan_context *mspan_context_create(dsm_segment *, mspan_manager *);
+extern void mspan_context_destroy(dsm_segment *, mspan_context *);
+
+/* Allocate or free memory. */
+extern void *mspan_alloc(dsm_segment *, mspan_context *, Size size, int flags);
+extern void mspan_free(dsm_segment *, void *);
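+
+/*
+ * Intended usage, once the allocation functions are implemented (sketch
+ * only; "seg" and "mgr" are placeholders for a mapped segment and a manager
+ * previously set up with mspan_initialize_dsm_manager, and error handling
+ * is omitted):
+ *
+ *		mspan_context *cxt;
+ *		void	   *p;
+ *
+ *		cxt = mspan_context_create(seg, mgr);
+ *		p = mspan_alloc(seg, cxt, 100, 0);
+ *		mspan_free(seg, p);
+ *		mspan_context_destroy(seg, cxt);
+ */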
+
+#endif /* MSPAN_H */