Re: [PATCH v6 1/4] last-modified: new subcommand to show when files were last modified

Patrick Steinhardt <ps@xxxxxx> · Thu, 31 Jul 2025 08:42:33 +0200

On Wed, Jul 30, 2025 at 07:55:07PM +0200, Toon Claes wrote:
> diff --git a/Documentation/git-last-modified.adoc b/Documentation/git-last-modified.adoc
> new file mode 100644
> index 0000000000..89138ebeb7
> --- /dev/null
> +++ b/Documentation/git-last-modified.adoc
> @@ -0,0 +1,49 @@
> +git-last-modified(1)
> +====================
> +
> +NAME
> +----
> +git-last-modified - EXPERIMENTAL: Show when files were last modified
> +
> +
> +SYNOPSIS
> +--------
> +[synopsis]
> +git last-modified [-r] [-t] [<revision-range>] [[--] <path>...]

I think we typically list long options here, not the short single-letter
ones.

> +
> +DESCRIPTION
> +-----------
> +
> +Shows which commit last modified each of the relevant files and subdirectories.
> +
> +THIS COMMAND IS EXPERIMENTAL. THE BEHAVIOR MAY CHANGE.
> +
> +OPTIONS
> +-------
> +
> +-r::

-r, --recursive::

> +	Recurse into subtrees.
> +
> +-t::

-t, --tree-in-recursive::

> diff --git a/builtin/last-modified.c b/builtin/last-modified.c
> new file mode 100644
> index 0000000000..e4c73464c7
> --- /dev/null
> +++ b/builtin/last-modified.c
[snip]
> +static int populate_paths_from_revs(struct last_modified *lm)
> +{
> +	int num_interesting = 0;
> +	struct diff_options diffopt;
> +
> +	memcpy(&diffopt, &lm->rev.diffopt, sizeof(diffopt));
> +	copy_pathspec(&diffopt.pathspec, &lm->rev.diffopt.pathspec);
> +	/*
> +	 * Use a callback to populate the paths from revs
> +	 */
> +	diffopt.output_format = DIFF_FORMAT_CALLBACK;
> +	diffopt.format_callback = add_path_from_diff;
> +	diffopt.format_callback_data = lm;

I feel like this whole block could use a comment that explains what
we're doing. Why do we copy `diffopt` around? Why is it fine to free
the struct at the end without unsetting `lm->rev.diffopt`? Couldn't that
cause a double free?

> +	for (size_t i = 0; i < lm->rev.pending.nr; i++) {
> +		struct object_array_entry *obj = lm->rev.pending.objects + i;
> +
> +		if (obj->item->flags & UNINTERESTING)
> +			continue;
> +
> +		if (num_interesting++)
> +			return error(_("last-modified can only operate on one tree at a time"));
> +
> +		diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
> +			      &obj->item->oid, "", &diffopt);
> +		diff_flush(&diffopt);
> +	}
> +	diff_free(&diffopt);
> +
> +	return 0;
> +}
> +
> +static void last_modified_emit(struct last_modified *lm,
> +			       const char *path, const struct commit *commit)
> +
> +{
> +	if (commit->object.flags & BOUNDARY)
> +		putchar('^');
> +	printf("%s\t", oid_to_hex(&commit->object.oid));
> +
> +	if (lm->rev.diffopt.line_termination)
> +		write_name_quoted(path, stdout, '\n');
> +	else
> +		printf("%s%c", path, '\0');
> +
> +	fflush(stdout);

Is there a reason why we have to explicitly flush output? This command
doesn't have any interactivity with the caller.

> +static void last_modified_diff(struct diff_queue_struct *q,
> +			       struct diff_options *opt UNUSED, void *cbdata)
> +{
> +	struct last_modified_callback_data *data = cbdata;
> +
> +	for (int i = 0; i < q->nr; i++) {
> +		struct diff_filepair *p = q->queue[i];
> +		switch (p->status) {
> +		case DIFF_STATUS_DELETED:
> +			/*
> +			 * There's no point in feeding a deletion, as it could
> +			 * not have resulted in our current state, which
> +			 * actually has the file.
> +			 */
> +			break;
> +
> +		default:
> +			/*
> +			 * Otherwise, we care only that we somehow arrived at
> +			 * a final oid state. Note that this covers some
> +			 * potentially controversial areas, including:
> +			 *
> +			 *  1. A rename or copy will be found, as it is the
> +			 *     first time the content has arrived at the given
> +			 *     path.

Makes sense that we don't handle renames (yet). I think I didn't spot
this in the manual, so maybe this is something we should document there.

> +			 *  2. Even a non-content modification like a mode or
> +			 *     type change will trigger it.

Seems sensible as a default, as well. And likewise, we can add
`--ignore-mode-changes` at a later point if we ever have a use case for
it.

> +			 * We take the inclusive approach for now, and find
> +			 * anything which impacts the path. Options to tweak
> +			 * the behavior (e.g., to "--follow" the content across
> +			 * renames) can come later.
> +			 */
> +			mark_path(p->two->path, &p->two->oid, data);
> +			break;
> +		}
> +	}
> +}
> +
> +static int last_modified_run(struct last_modified *lm)
> +{
> +	struct last_modified_callback_data data = { .lm = lm };
> +
> +	lm->rev.diffopt.output_format = DIFF_FORMAT_CALLBACK;
> +	lm->rev.diffopt.format_callback = last_modified_diff;
> +	lm->rev.diffopt.format_callback_data = &data;
> +
> +	prepare_revision_walk(&lm->rev);
> +
> +	while (hashmap_get_size(&lm->paths)) {
> +		data.commit = get_revision(&lm->rev);
> +		if (!data.commit)
> +			break;

So in this case we have reached the end of our commit range. I assume we
simply print the oldest commit of that range in this case?

> +		if (data.commit->object.flags & BOUNDARY) {
> +			diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
> +				      &data.commit->object.oid, "",
> +				      &lm->rev.diffopt);
> +			diff_flush(&lm->rev.diffopt);
> +		} else {
> +			log_tree_commit(&lm->rev, data.commit);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int last_modified_init(struct last_modified *lm, struct repository *r,
> +			      const char *prefix, int argc, const char **argv)
> +{
> +	hashmap_init(&lm->paths, last_modified_entry_hashcmp, NULL, 0);
> +
> +	repo_init_revisions(r, &lm->rev, prefix);
> +	lm->rev.def = "HEAD";
> +	lm->rev.combine_merges = 1;
> +	lm->rev.show_root_diff = 1;
> +	lm->rev.boundary = 1;
> +	lm->rev.no_commit_id = 1;
> +	lm->rev.diff = 1;
> +	lm->rev.diffopt.flags.recursive = lm->recursive || lm->tree_in_recursive;
> +	lm->rev.diffopt.flags.tree_in_recursive = lm->tree_in_recursive;
> +
> +	if ((argc = setup_revisions(argc, argv, &lm->rev, NULL)) > 1) {

Tiny nit: it's rather unusual in our codebase to assign values in
conditionals. I personally don't mind this usage at all -- I think it
can make error handling way less verbose. But I'm not sure whether we
deem this style acceptable.

        argc = setup_revisions(argc, argv, &lm->rev, NULL)
        if (argc) {
            ...
        }

I've seen this style several times in this patch. I think we should keep
our typical style for now, but I wouldn't mind if you sent a patch for
our coding style document so that we can discuss this.

> +		error(_("unknown last-modified argument: %s"), argv[1]);
> +		return argc;
> +	}
> +
> +	if (populate_paths_from_revs(lm) < 0)
> +		return error(_("unable to setup last-modified"));
> +
> +	return 0;
> +}
> +
> +int cmd_last_modified(int argc, const char **argv, const char *prefix,
> +		      struct repository *repo)
> +{
> +	int ret;
> +	struct last_modified lm;
> +
> +	const char * const last_modified_usage[] = {
> +		N_("git last-modified [-r] [-t] "
> +		   "[<revision-range>] [[--] <path>...]"),
> +		NULL
> +	};
> +
> +	struct option last_modified_options[] = {
> +		OPT_BOOL('r', "recursive", &lm.recursive,
> +			 N_("recurse into subtrees")),
> +		OPT_BOOL('t', "tree-in-recursive", &lm.tree_in_recursive,
> +			 N_("recurse into subtrees and include the tree entries too")),

Should this maybe be called something like "--recursive-with-trees"?
"--tree-in-recursive" reads somewhat strange to me.

> +		OPT_END()
> +	};
> +
> +	memset(&lm, 0, sizeof(lm));

You can avoid the `memset()` and directly zero-initialize the struct
when it's declared. Alternatively, you can move this function call into
`last_modified_init()` itself, where it would be more reasonable.

> +	argc = parse_options(argc, argv, prefix, last_modified_options,
> +			     last_modified_usage,
> +			     PARSE_OPT_KEEP_ARGV0 | PARSE_OPT_KEEP_UNKNOWN_OPT);
> +
> +	repo_config(repo, git_default_config, NULL);
> +
> +	if ((ret = last_modified_init(&lm, repo, prefix, argc, argv))) {
> +		if (ret > 0)
> +			usage_with_options(last_modified_usage,
> +					   last_modified_options);
> +		goto out;
> +	}
> +
> +	if ((ret = last_modified_run(&lm)))
> +		goto out;

Two more cases where we assign `if ((ret = ...))`.

Patrick