I write in C for several projects (e.g. learning / teaching, coding competitions, data processing) and frequently need arrays (e.g. strings) with fast appending / concatenation / reversing.
I've isolated the code below from a project. For simplicity, I assume all arguments are valid, including no pointer aliasing (for each function, no two arguments point to overlapping blocks of memory). I use a doubling strategy on reallocation.
I understand that vector
and string
from C++ likely have superior performance / features - this is for settings where a simple C solution is more accessible or easier to modify for the specific context of the problem.
stringi.h
#include <assert.h> // contiguous dynamic array for fixed-size type T // note: v[] is not terminated (no '\0') typedef struct { char *v; // T *v (testing: T = char) size_t count; // count <= capacity size_t capacity; // sizeof(v) / sizeof(T) } stringi_raw; // safe non-method mutations: // 1. modifying v[i] for indices i // e.g. v[i] = 'a'; ++v[i]; // 2. if n <= count, assigning n to count will keep // the first n elements; beware of underflow! // e.g. while (count > 0) --count; // maximum capacity: capacity <= CAP_MAX // practical: CAP_MAX = SIZE_MAX / sizeof(T) static const size_t CAP_MAX = 987654321; // pointer to a stringi_raw typedef stringi_raw *stringi; // initializes a stringi with 0 elements // to be deallocated by stringi_free stringi stringi_init(); // frees memory used by target from the heap void stringi_free(stringi target); // allow n items to be stored without realloc // note: assumes n <= CAP_MAX void stringi_reserve(stringi target, size_t n); // appends c to the end of target // note: takes O(1) amortized time void stringi_put(stringi target, char c); // appends the contents of src to dst void stringi_cat(stringi dst, const stringi src); // reverses target->v void stringi_reverse(stringi target); // aka "reverse(src); cat(dst, src); reverse(src)" void stringi_cat_rev(stringi dst, const stringi src);
stringi.c
#include <stdlib.h> #include "stringi.h" stringi stringi_init() { stringi target = malloc(sizeof(stringi_raw)); assert(target != NULL); target->v = NULL; // realloc(NULL, x) = malloc(x) target->count = 0; target->capacity = 0; return target; } void stringi_free(stringi target) { free(target->v); // free(NULL) is safe free(target); } // sets the capacity of target to cap // note: assumes count <= cap <= CAP_MAX void stringi_set_cap(stringi target, size_t cap) { void *v = realloc(target->v, cap * sizeof(char)); assert(v != NULL); target->v = v; target->capacity = cap; } // get the next capacity from cap size_t double_or_max(size_t cap) { if (cap > CAP_MAX / 2) return CAP_MAX; return 2 * cap; } void stringi_reserve(stringi target, size_t n) { size_t cap = target->capacity; if (n > cap) { if (cap == 0) cap = 1; else { cap = double_or_max(cap); while (n > cap) cap = double_or_max(cap); } stringi_set_cap(target, cap); } } void stringi_put(stringi target, char c) { size_t n = target->count; assert(n < CAP_MAX); size_t n_fin = n + 1; stringi_reserve(target, n_fin); target->v[n] = c; target->count = n_fin; } void stringi_cat(stringi dst, const stringi src) { size_t n_d = dst->count; size_t n_s = src->count; assert(n_d <= CAP_MAX - n_s); size_t n_fin = n_d + n_s; stringi_reserve(dst, n_fin); for (size_t i = 0; i < n_s; ++i) dst->v[n_d + i] = src->v[i]; dst->count = n_fin; } // if min <= max, reverse v[min] to v[max] inclusive void chars_reverse(char *v, size_t min, size_t max) { while (min < max) { char temp = v[min]; v[min] = v[max]; v[max] = temp; ++min; --max; } } void stringi_reverse(stringi target) { // avoid underflow when count = 0 size_t count = target->count; if (count > 0) chars_reverse(target->v, 0, count - 1); } void stringi_cat_rev(stringi dst, const stringi src) { size_t n_d = dst->count; size_t n_s = src->count; assert(n_d <= CAP_MAX - n_s); size_t n_fin = n_d + n_s; stringi_reserve(dst, n_fin); for (size_t i = 0; i < n_s; ++i) dst->v[n_fin - 1 - i] = src->v[i]; dst->count = n_fin; }
Tests
#include <stdio.h> #include <limits.h> #include <string.h> #include <time.h> #include "stringi.h" void stringi_remove(stringi *target) { stringi_free(*target); *target = NULL; } void stringi_print(stringi target) { printf("'%.*s' (%zu/%zu)\n", (unsigned int) target->count, target->v, target->count, target->capacity); } void stringi_validate(stringi target, char *expected) { size_t n = strlen(expected); assert(n == target->count); assert(0 == strncmp(expected, target->v, n)); } int diff_to_ms(clock_t diff) { return diff * 1000 / CLOCKS_PER_SEC; } clock_t time_putchar(stringi target) { clock_t start = clock(); for (size_t i = 0; i < CAP_MAX; ++i) stringi_put(target, 'c'); return clock() - start; } int main() { assert(CAP_MAX >= 1); assert(CAP_MAX <= SIZE_MAX / sizeof(char)); stringi s1 = stringi_init(); stringi_validate(s1, ""); printf("create: s1 = %p\n", s1); printf("\nexpanding s1:\n"); stringi_reverse(s1); stringi_validate(s1, ""); for (int i = 0; i < 13; ++i) { stringi_print(s1); stringi_put(s1, 'a' + i); } stringi_validate(s1, "abcdefghijklm"); stringi_print(s1); stringi s2 = stringi_init(); stringi_validate(s2, ""); printf("\ncreate: s2 = %p\n", s2); for (int i = 13; i < 26; ++i) stringi_put(s2, 'a' + i); stringi_validate(s2, "nopqrstuvwxyz"); stringi_cat(s1, s2); stringi_validate(s1, "abcdefghijklmnopqrstuvwxyz"); stringi_reverse(s1); stringi_validate(s1, "zyxwvutsrqponmlkjihgfedcba"); s2->count = 2; stringi_validate(s2, "no"); stringi_cat(s1, s2); stringi_validate(s1, "zyxwvutsrqponmlkjihgfedcbano"); stringi_cat_rev(s1, s2); stringi_validate(s1, "zyxwvutsrqponmlkjihgfedcbanoon"); stringi_remove(&s1); printf("remove: s1 = %p\n", s1); stringi_remove(&s2); printf("remove: s2 = %p\n", s2); s1 = stringi_init(); stringi_validate(s1, ""); printf("create: s1 = %p\n", s1); s2 = stringi_init(); stringi_validate(s2, ""); printf("create: s2 = %p\n", s2); int diff1 = diff_to_ms(time_putchar(s1)); printf("\ns1 += CAP_MAX * 'c': %d ms\n", diff1); stringi_cat(s1, s2); // expect error: exceeding CAP_MAX // stringi_put(s1, 'c'); s1->count = 0; stringi_validate(s1, ""); printf("clear s1:\n"); stringi_print(s1); stringi_remove(&s1); stringi_reserve(s2, CAP_MAX); printf("\nreserve for s2: CAP_MAX\n"); int diff2 = diff_to_ms(time_putchar(s2)); printf("s2 += CAP_MAX * 'c': %d ms\n", diff2); s2->count = 0; stringi_validate(s2, ""); printf("clear s2:\n"); stringi_print(s2); stringi_remove(&s2); return 0; }
Output
create: s1 = 0000015E9D66F050 expanding s1: '' (0/0) 'a' (1/1) 'ab' (2/2) 'abc' (3/4) 'abcd' (4/4) 'abcde' (5/8) 'abcdef' (6/8) 'abcdefg' (7/8) 'abcdefgh' (8/8) 'abcdefghi' (9/16) 'abcdefghij' (10/16) 'abcdefghijk' (11/16) 'abcdefghijkl' (12/16) 'abcdefghijklm' (13/16) create: s2 = 0000015E9D66EEB0 remove: s1 = 0000000000000000 remove: s2 = 0000000000000000 create: s1 = 0000015E9D66EF90 create: s2 = 0000015E9D66F110 s1 += CAP_MAX * 'c': 1120 ms clear s1: '' (0/987654321) reserve for s2: CAP_MAX s2 += CAP_MAX * 'c': 931 ms clear s2: '' (0/987654321)
I would appreciate any thoughts on performance, readability, or modifiability.
assert
is wrong. In the functionstringi_set_cap
you have the comment "assumes count <= cap <= CAP_MAX". That assumption should be checked with an assert. That is exactly what asserts are for. It'll be there when debugging, so the user can see when they call your functions in the wrong way, but won't be there in production, to not slow down the function.\$\endgroup\$realloc
is far superior to C++new
/delete
which omits any way to grow an existing allocation. Of course you need link-time optimization to inline across C source files to reduce per-push overhead since you put all your tiny functions in a separate.c
instead of directly in the.h
withstatic inline
(orinline
plus a.c
withextern inline
instantiation).\$\endgroup\$std::vector
for 7 billionpush_back
operations. The code above takes56176 ms
without optimizations and18947 ms
with optimizations, whilestd::vector
takes485778 ms
without optimizations and10968 ms
with optimizations. Unfortunately, the optimized executable for my code appears to be causing a ton of branch mispredictions (I have no idea what magic causesstd::vector
to avoid this).\$\endgroup\$-flto
orgcc -fwhole-program
to allow cross-file inlining from yourstringi.c
, if your benchmark was in a different file? Can you link your benchmark code, e.g. on godbolt.org (with compiler version/options you actually used) so I can see what you did and maybe profile it on my own Skylake system? Also what CPU?\$\endgroup\$% 64
every iteration, which costs amovabs r64, imm64
and a 64-bitmul
and some shift / LEA for each byte, which is maybe a similar amount of overhead to the bookkeeping to check if realloc is needed. I'm seeing negligible branch mispredicts withperf stat
on Skylake, miss rate is 0.00%. IPC is 3.41 instructions per clock. It's not getting optimized away despite not doing anything with the results except read the count, at least not by GCC. Clang can be more aggressive about optimizing away alloc/free even for code that writes the to buffer.\$\endgroup\$