Skip to content

Commit 5146cc9

Browse files
derrickstolee authored and dscho committed
survey: add report of "largest" paths
Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about * Number of versions. * Total size on disk. * Total inflated size (no delta or zlib compression). This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'. The exact disk size seems to be not quite robust enough for testing, as could be seen by the `linux-musl-meson` job consistently failing, possibly because zlib-ng deflates differently: t8100.4(git survey (default)) was failing with a symptom like this: TOTAL OBJECT SIZES BY TYPE =============================================== Object Type | Count | Disk Size | Inflated Size ------------+-------+-----------+-------------- - Commits | 10 | 1523 | 2153 + Commits | 10 | 1528 | 2153 Trees | 10 | 495 | 1706 Blobs | 10 | 191 | 101 - Tags | 4 | 510 | 528 + Tags | 4 | 547 | 528 This means: the disk size is unlikely to be something we can verify robustly. Since zlib-ng seems to increase the disk size of the tags from 510 to 547 (above their inflated size of 528), we cannot even assume that the disk size is always smaller than the inflated size. We will most likely want to either skip verifying the disk size altogether, or go for some kind of fuzzy matching, say, by replacing `s/ 1[45][0-9][0-9] / ~1.5k /` and `s/ [45][0-9][0-9] / ~½k /` or something like that. Signed-off-by: Derrick Stolee <[email protected]> Signed-off-by: Johannes Schindelin <[email protected]>
1 parent 7875abb commit 5146cc9

File tree

2 files changed

+82
-8
lines changed

2 files changed

+82
-8
lines changed

builtin/survey.c

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ struct survey_report_object_size_summary {
7575

7676
typedef int (*survey_top_cmp)(void *v1, void *v2);
7777

78-
MAYBE_UNUSED
7978
static int cmp_by_nr(void *v1, void *v2)
8079
{
8180
struct survey_report_object_size_summary *s1 = v1;
@@ -88,7 +87,6 @@ static int cmp_by_nr(void *v1, void *v2)
8887
return 0;
8988
}
9089

91-
MAYBE_UNUSED
9290
static int cmp_by_disk_size(void *v1, void *v2)
9391
{
9492
struct survey_report_object_size_summary *s1 = v1;
@@ -101,7 +99,6 @@ static int cmp_by_disk_size(void *v1, void *v2)
10199
return 0;
102100
}
103101

104-
MAYBE_UNUSED
105102
static int cmp_by_inflated_size(void *v1, void *v2)
106103
{
107104
struct survey_report_object_size_summary *s1 = v1;
@@ -132,7 +129,6 @@ struct survey_report_top_table {
132129
void *data;
133130
};
134131

135-
MAYBE_UNUSED
136132
static void init_top_sizes(struct survey_report_top_table *top,
137133
size_t limit, const char *name,
138134
survey_top_cmp cmp)
@@ -158,7 +154,6 @@ static void clear_top_sizes(struct survey_report_top_table *top)
158154
free(sz_array);
159155
}
160156

161-
MAYBE_UNUSED
162157
static void maybe_insert_into_top_size(struct survey_report_top_table *top,
163158
struct survey_report_object_size_summary *summary)
164159
{
@@ -195,6 +190,10 @@ struct survey_report {
195190
struct survey_report_object_summary reachable_objects;
196191

197192
struct survey_report_object_size_summary *by_type;
193+
194+
struct survey_report_top_table *top_paths_by_count;
195+
struct survey_report_top_table *top_paths_by_disk;
196+
struct survey_report_top_table *top_paths_by_inflate;
198197
};
199198

200199
#define REPORT_TYPE_COMMIT 0
@@ -446,6 +445,13 @@ static void survey_report_object_sizes(const char *title,
446445
clear_table(&table);
447446
}
448447

448+
static void survey_report_plaintext_sorted_size(
449+
struct survey_report_top_table *top)
450+
{
451+
survey_report_object_sizes(top->name, _("Path"),
452+
top->data, top->nr);
453+
}
454+
449455
static void survey_report_plaintext(struct survey_context *ctx)
450456
{
451457
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -456,6 +462,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
456462
_("Object Type"),
457463
ctx->report.by_type,
458464
REPORT_TYPE_COUNT);
465+
466+
survey_report_plaintext_sorted_size(
467+
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
468+
survey_report_plaintext_sorted_size(
469+
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
470+
471+
survey_report_plaintext_sorted_size(
472+
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
473+
survey_report_plaintext_sorted_size(
474+
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
475+
476+
survey_report_plaintext_sorted_size(
477+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
478+
survey_report_plaintext_sorted_size(
479+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
459480
}
460481

461482
/*
@@ -697,7 +718,8 @@ static void increment_totals(struct survey_context *ctx,
697718

698719
static void increment_object_totals(struct survey_context *ctx,
699720
struct oid_array *oids,
700-
enum object_type type)
721+
enum object_type type,
722+
const char *path)
701723
{
702724
struct survey_report_object_size_summary *total;
703725
struct survey_report_object_size_summary summary = { 0 };
@@ -729,6 +751,27 @@ static void increment_object_totals(struct survey_context *ctx,
729751
total->disk_size += summary.disk_size;
730752
total->inflated_size += summary.inflated_size;
731753
total->num_missing += summary.num_missing;
754+
755+
if (type == OBJ_TREE || type == OBJ_BLOB) {
756+
int index = type == OBJ_TREE ?
757+
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
758+
struct survey_report_top_table *top;
759+
760+
/*
761+
* Temporarily store (const char *) here, but it will
762+
* be duped if inserted and will not be freed.
763+
*/
764+
summary.label = (char *)path;
765+
766+
top = ctx->report.top_paths_by_count;
767+
maybe_insert_into_top_size(&top[index], &summary);
768+
769+
top = ctx->report.top_paths_by_disk;
770+
maybe_insert_into_top_size(&top[index], &summary);
771+
772+
top = ctx->report.top_paths_by_inflate;
773+
maybe_insert_into_top_size(&top[index], &summary);
774+
}
732775
}
733776

734777
static int survey_objects_path_walk_fn(const char *path,
@@ -740,7 +783,7 @@ static int survey_objects_path_walk_fn(const char *path,
740783

741784
increment_object_counts(&ctx->report.reachable_objects,
742785
type, oids->nr);
743-
increment_object_totals(ctx, oids, type);
786+
increment_object_totals(ctx, oids, type, path);
744787

745788
ctx->progress_nr += oids->nr;
746789
display_progress(ctx->progress, ctx->progress_nr);
@@ -750,11 +793,31 @@ static int survey_objects_path_walk_fn(const char *path,
750793

751794
static void initialize_report(struct survey_context *ctx)
752795
{
796+
const int top_limit = 100;
797+
753798
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
754799
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
755800
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
756801
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
757802
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
803+
804+
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
805+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
806+
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
807+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
808+
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
809+
810+
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
811+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
812+
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
813+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
814+
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
815+
816+
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
817+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
818+
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
819+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
820+
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
758821
}
759822

760823
static void survey_phase_objects(struct survey_context *ctx)

t/t8100-git-survey.sh

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,18 @@ test_expect_success 'git survey (default)' '
8686
Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size)
8787
EOF
8888
89-
test_cmp expect out
89+
lines=$(wc -l <expect) &&
90+
head -n $lines out >out-trimmed &&
91+
sed -e "s/ 1528 / 1523 /" -e "s/ 547 / 510 /" out-trimmed >out-edited &&
92+
test_cmp expect out-edited &&
93+
94+
for type in "DIRECTORIES" "FILES"
95+
do
96+
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
97+
do
98+
grep "TOP $type BY $metric" out || return 1
99+
done || return 1
100+
done
90101
'
91102

92103
test_done

0 commit comments

Comments
 (0)