From 2342838354ff8ffdd2af11d9edcb438e204b0be7 Mon Sep 17 00:00:00 2001 From: Eygene Ryabinkin Date: Wed, 10 Apr 2013 01:30:30 +0400 Subject: [PATCH] Speed up vulnerability audit process Sort VuXML entries using lexicographical ordering of largest non-globbing prefixes and use few tricks to benefit from the sorted array: * select the first entry for the given installed package using precomputed table for the first byte of the package name; * skip entries with identical non-globbing prefix if it does not match the package name; * once we found VuXML entry with the non-globbing prefix that is strictly larger than the prefix of the same size taken from the package name, there is no point to search any further. This gives me up to 5x decrease in total processing time for 'pkg audit'. Signed-off-by: Eygene Ryabinkin --- pkg/audit.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 195 insertions(+), 15 deletions(-) diff --git a/pkg/audit.c b/pkg/audit.c index cd63bfa..55496d1 100644 --- a/pkg/audit.c +++ b/pkg/audit.c @@ -65,6 +65,52 @@ struct audit_entry { SLIST_HEAD(audit_head, audit_entry); +/* + * The _sorted stuff. + * + * We are using the optimized search based on the following observations: + * + * - number of VuXML entries is more likely to be far greater than + * the number of installed ports; thus we should try to optimize + * the walk through all entries for a given port; + * + * - fnmatch() is good and fast, but if we will compare the audit entry + * name prefix without globbing characters to the prefix of port name + * of the same length and they are different, there is no point to + * check the rest; + * + * - (most important bit): if parsed VuXML entries are lexicographically + * sorted per the largest prefix with no globbing characters and we + * know how many succeeding entries have the same prefix we can + * + * a. skip the rest of the entries once the non-globbing prefix is + * lexicographically larger than the port name prefix of the + * same length: all successive prefixes will be larger as well; + * + * b. if we have non-globbing prefix that is lexicographically smaller + * than port name prefix, we can skip all succeeding entries with + * the same prefix; and as some port names tend to repeat due to + * multiple vulnerabilities, it could be a large win. + */ +struct audit_entry_sorted { + struct audit_entry *e; /* Entry itself */ + size_t noglob_len; /* Prefix without glob characters */ + size_t next_pfx_incr; /* Index increment for the entry with + different prefix */ +}; + +/* + * Another small optimization to skip the beginning of the + * VuXML entry array, if possible. + * + * audit_entry_first_byte_idx[ch] represents the index + * of the first VuXML entry in the sorted array that has + * its non-globbing prefix that is started with the character + * 'ch'. It allows to skip entries from the beginning of the + * VuXML array that aren't relevant for the checked port name. + */ +static size_t audit_entry_first_byte_idx[256]; + void usage_audit(void) { @@ -251,6 +297,119 @@ parse_db(const char *path, struct audit_head *h) return EPKG_OK; } +/* + * Returns the length of the largest prefix without globbing + * characters, as per fnmatch(). + */ +static size_t +str_noglob_len(const char *s) +{ + size_t n; + + for (n = 0; s[n] && s[n] != '*' && s[n] != '?' && + s[n] != '[' && s[n] != '{' && s[n] != '\\'; n++); + + return n; +} + +/* + * Helper for quicksort that lexicographically orders prefixes. + */ +static int +audit_entry_compare(const void *a, const void *b) +{ + struct audit_entry_sorted *e1, *e2; + size_t min_len; + int result; + + e1 = (struct audit_entry_sorted *)a; + e2 = (struct audit_entry_sorted *)b; + + min_len = (e1->noglob_len < e2->noglob_len ? + e1->noglob_len : e2->noglob_len); + result = strncmp(e1->e->pkgname, e2->e->pkgname, min_len); + /* + * Additional check to see if some word is a prefix of an + * another one and, thus, should go before the former. + */ + if (result == 0) { + if (e1->noglob_len < e2->noglob_len) + result = -1; + else if (e1->noglob_len > e2->noglob_len) + result = 1; + } + + return (result); +} + +/* + * Sorts VuXML entries and calculates increments to jump to the + * next distinct prefix. + */ +static struct audit_entry_sorted * +preprocess_db(struct audit_head *h) +{ + struct audit_entry *e; + struct audit_entry_sorted *ret; + size_t i, n, tofill; + + n = 0; + SLIST_FOREACH(e, h, next) + n++; + + ret = (struct audit_entry_sorted *)calloc(n + 1, sizeof(ret[0])); + if (ret == NULL) + err(1, "calloc(audit_entry_sorted*)"); + bzero((void *)ret, (n + 1) * sizeof(ret[0])); + + n = 0; + SLIST_FOREACH(e, h, next) { + ret[n].e = e; + ret[n].noglob_len = str_noglob_len(e->pkgname); + ret[n].next_pfx_incr = 1; + n++; + } + + qsort(ret, n, sizeof(*ret), audit_entry_compare); + + /* + * Determining jump indexes to the next different prefix. + * Only non-1 increments are calculated there. + * + * Due to the current usage that picks only increment for the + * first of the non-unique prefixes in a row, we could + * calculate only that one and skip calculations for the + * succeeding, but for the uniformity and clarity we're + * calculating 'em all. + */ + for (n = 1, tofill = 0; ret[n].e; n++) { + if (ret[n - 1].noglob_len != ret[n].noglob_len) { + struct audit_entry_sorted *base; + + base = ret + n - tofill; + for (i = 0; tofill > 1; i++, tofill--) + base[i].next_pfx_incr = tofill; + tofill = 1; + } else if (strcmp(ret[n - 1].e->pkgname, + ret[n].e->pkgname) == 0) { + tofill++; + } else { + tofill = 1; + } + } + + /* Calculate jump indexes for the first byte of the package name */ + bzero(audit_entry_first_byte_idx, sizeof(audit_entry_first_byte_idx)); + for (n = 1, i = 0; n < 256; n++) { + while (ret[i].e != NULL && + (size_t)(ret[i].e->pkgname[0]) < n) + i++; + audit_entry_first_byte_idx[n] = i; + } + + return (ret); +} + static bool match_version(const char *pkgversion, struct version_entry *v) { @@ -281,7 +440,7 @@ match_version(const char *pkgversion, struct version_entry *v) } static bool -is_vulnerable(struct audit_head *h, struct pkg *pkg) +is_vulnerable(struct audit_entry_sorted *a, struct pkg *pkg) { struct audit_entry *e; const char *pkgname; @@ -293,20 +452,38 @@ is_vulnerable(struct audit_head *h, struct pkg *pkg) PKG_VERSION, &pkgversion ); - SLIST_FOREACH(e, h, next) { - if (fnmatch(e->pkgname, pkgname, 0) != 0) + a += audit_entry_first_byte_idx[(size_t)pkgname[0]]; + for (; (e = a->e) != NULL; a += a->next_pfx_incr) { + int cmp; + size_t i; + + /* + * Audit entries are sorted, so if we had found one + * that is lexicographically greater than our name, + * it and the rest won't match our name. + */ + cmp = strncmp(pkgname, e->pkgname, a->noglob_len); + if (cmp > 0) continue; + else if (cmp < 0) + break; - res1 = match_version(pkgversion, &e->v1); - res2 = match_version(pkgversion, &e->v2); - if (res1 && res2) { - res = true; - if (quiet) { - printf("%s-%s\n", pkgname, pkgversion); - } else { - printf("%s-%s is vulnerable:\n", pkgname, pkgversion); - printf("%s\n", e->desc); - printf("WWW: %s\n\n", e->url); + for (i = 0; i < a->next_pfx_incr; i++) { + e = a[i].e; + if (fnmatch(e->pkgname, pkgname, 0) != 0) + continue; + + res1 = match_version(pkgversion, &e->v1); + res2 = match_version(pkgversion, &e->v2); + if (res1 && res2) { + res = true; + if (quiet) { + printf("%s-%s\n", pkgname, pkgversion); + } else { + printf("%s-%s is vulnerable:\n", pkgname, pkgversion); + printf("%s\n", e->desc); + printf("WWW: %s\n\n", e->url); + } } } } @@ -333,6 +510,7 @@ int exec_audit(int argc, char **argv) { struct audit_head h = SLIST_HEAD_INITIALIZER(); + struct audit_entry_sorted *cooked_audit_entries = NULL; struct pkgdb *db = NULL; struct pkgdb_it *it = NULL; struct pkg *pkg = NULL; @@ -403,7 +581,8 @@ exec_audit(int argc, char **argv) ret = EX_DATAERR; goto cleanup; } - is_vulnerable(&h, pkg); + cooked_audit_entries = preprocess_db(&h); + is_vulnerable(cooked_audit_entries, pkg); goto cleanup; } @@ -441,9 +620,10 @@ exec_audit(int argc, char **argv) ret = EX_DATAERR; goto cleanup; } + cooked_audit_entries = preprocess_db(&h); while ((ret = pkgdb_it_next(it, &pkg, PKG_LOAD_BASIC)) == EPKG_OK) - if (is_vulnerable(&h, pkg)) + if (is_vulnerable(cooked_audit_entries, pkg)) vuln++; if (ret == EPKG_END && vuln == 0) -- 1.8.1.3