Re: [PATCH RFC v2 1/5] last-modified: new subcommand to show when files were last modified

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Patrick Steinhardt <ps@xxxxxx> writes:

> On Fri, May 23, 2025 at 11:33:48AM +0200, Toon Claes wrote:
>> diff --git a/Documentation/git-last-modified.adoc b/Documentation/git-last-modified.adoc
>> new file mode 100644
>> index 0000000000..1af38f402e
>> --- /dev/null
>> +++ b/Documentation/git-last-modified.adoc
>> @@ -0,0 +1,49 @@
>> +git-last-modified(1)
>> +====================
>> +
>> +NAME
>> +----
>> +git-last-modified - EXPERIMENTAL: Show when files were last modified
>
> Nit: we don't have the EXPERIMENTAL label here for git-switch(1) or
> git-restore(1).

But we do for `git-replay(1)`. Because I haven't gotten much feedback
about the usage of the command, I wanted to be on the safe side and not
commit to the behavior. Marking it EXPERIMENTAL would allow us to make
changes on it's interface without _breaking_. But I wouldn't mind
dropping the experimental status.

>> +
>> +
>> +SYNOPSIS
>> +--------
>> +[synopsis]
>> +git last-modified [-r] [<revision-range>] [[--] <path>...]
>> +
>> +DESCRIPTION
>> +-----------
>> +
>> +Shows which commit last modified each of the relevant files and subdirectories.
>> +
>> +THIS COMMAND IS EXPERIMENTAL. THE BEHAVIOR MAY CHANGE.
>> +
>> +OPTIONS
>> +-------
>> +
>> +-r::
>> +	Recurse into subtrees.
>> +
>> +-t::
>> +	Show tree entry itself as well as subtrees.  Implies `-r`.
>
> These flags aren't yet supported in this version, are they?

They are, but I see there are no tests for `-t`. I shall add them.

>
>> diff --git a/builtin/last-modified.c b/builtin/last-modified.c
>> new file mode 100644
>> index 0000000000..0d4733f666
>> --- /dev/null
>> +++ b/builtin/last-modified.c
>> @@ -0,0 +1,43 @@
>> +#include "git-compat-util.h"
>> +#include "last-modified.h"
>> +#include "hex.h"
>> +#include "quote.h"
>> +#include "config.h"
>> +#include "object-name.h"
>> +#include "parse-options.h"
>> +#include "builtin.h"
>> +
>> +static void show_entry(const char *path, const struct commit *commit, void *d)
>> +{
>> +	struct last_modified *lm = d;
>> +
>> +	if (commit->object.flags & BOUNDARY)
>> +		putchar('^');
>> +	printf("%s\t", oid_to_hex(&commit->object.oid));
>> +
>> +	if (lm->rev.diffopt.line_termination)
>> +		write_name_quoted(path, stdout, '\n');
>> +	else
>> +		printf("%s%c", path, '\0');
>> +
>> +	fflush(stdout);
>> +}
>> +
>> +int cmd_last_modified(int argc,
>> +		   const char **argv,
>> +		   const char *prefix,
>> +		   struct repository *repo)
>> +{
>> +	int ret = 0;
>
> `ret` is basically unused here, we only use it to return 0.

Good catch. Thanks!

>> diff --git a/last-modified.c b/last-modified.c
>> new file mode 100644
>> index 0000000000..9283f8fcae
>> --- /dev/null
>> +++ b/last-modified.c
>> @@ -0,0 +1,213 @@
>> +#include "git-compat-util.h"
>> +#include "last-modified.h"
>> +#include "commit.h"
>> +#include "diffcore.h"
>> +#include "diff.h"
>> +#include "object.h"
>> +#include "revision.h"
>> +#include "repository.h"
>> +#include "log-tree.h"
>> +
>> +struct last_modified_entry {
>> +	struct hashmap_entry hashent;
>> +	struct object_id oid;
>> +	struct commit *commit;
>> +	const char path[FLEX_ARRAY];
>> +};
>> +
>> +static void add_from_diff(struct diff_queue_struct *q,
>> +			  struct diff_options *opt UNUSED,
>> +			  void *data)
>> +{
>> +	struct last_modified *lm = data;
>> +
>> +	for (int i = 0; i < q->nr; i++) {
>> +		struct diff_filepair *p = q->queue[i];
>> +		struct last_modified_entry *ent;
>> +		const char *path = p->two->path;
>> +
>> +		FLEX_ALLOC_STR(ent, path, path);
>> +		oidcpy(&ent->oid, &p->two->oid);
>> +		hashmap_entry_init(&ent->hashent, strhash(ent->path));
>> +		hashmap_add(&lm->paths, &ent->hashent);
>> +	}
>> +}
>> 
>> +static int add_from_revs(struct last_modified *lm)
>> +{
>> +	size_t count = 0;
>> +	struct diff_options diffopt;
>> +
>> +	memcpy(&diffopt, &lm->rev.diffopt, sizeof(diffopt));
>> +	copy_pathspec(&diffopt.pathspec, &lm->rev.diffopt.pathspec);
>> +	diffopt.output_format = DIFF_FORMAT_CALLBACK;
>> +	diffopt.format_callback = add_from_diff;
>> +	diffopt.format_callback_data = lm;
>
> As far as I understand we populate `paths` from the diff here, and
> `paths` later on acts as a filter of paths we're interested in? Might be
> nice to add a comment explaining the intent of this.

Will do.


>> +	for (size_t i = 0; i < lm->rev.pending.nr; i++) {
>> +		struct object_array_entry *obj = lm->rev.pending.objects + i;
>> +
>> +		if (obj->item->flags & UNINTERESTING)
>> +			continue;
>> +
>> +		if (count++)
>> +			return error(_("can only get last-modified one tree at a time"));
>
> It's a bit funny that `count` is pretending to be a counter even though
> it ultimately is only a boolean flag whether we have already seen an
> interesting item.

Okay, I'll refactor.

>> +		diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
>> +			      &obj->item->oid, "", &diffopt);
>> +		diff_flush(&diffopt);
>> +	}
>> +	clear_pathspec(&diffopt.pathspec);
>
> Shouldn't we call `diff_free()` instead of `clear_pathspec` to clear the
> whole `struct diff_options`?

Yeah, that's better.

>> +
>> +	return 0;
>> +}
>> +
>> +static int last_modified_entry_hashcmp(const void *unused UNUSED,
>> +				    const struct hashmap_entry *hent1,
>> +				    const struct hashmap_entry *hent2,
>> +				    const void *path)
>> +{
>> +	const struct last_modified_entry *ent1 =
>> +		container_of(hent1, const struct last_modified_entry, hashent);
>> +	const struct last_modified_entry *ent2 =
>> +		container_of(hent2, const struct last_modified_entry, hashent);
>> +	return strcmp(ent1->path, path ? path : ent2->path);
>> +}
>> +
>> +void last_modified_init(struct last_modified *lm,
>> +		     struct repository *r,
>> +		     const char *prefix,
>> +		     int argc, const char **argv)
>> +{
>> +	memset(lm, 0, sizeof(*lm));
>> +	hashmap_init(&lm->paths, last_modified_entry_hashcmp, NULL, 0);
>> +
>> +	repo_init_revisions(r, &lm->rev, prefix);
>> +	lm->rev.def = "HEAD";
>> +	lm->rev.combine_merges = 1;
>> +	lm->rev.show_root_diff = 1;
>> +	lm->rev.boundary = 1;
>> +	lm->rev.no_commit_id = 1;
>> +	lm->rev.diff = 1;
>> +	if (setup_revisions(argc, argv, &lm->rev, NULL) > 1)
>> +		die(_("unknown last-modified argument: %s"), argv[1]);
>> +
>> +	if (add_from_revs(lm) < 0)
>> +		die(_("unable to setup last-modified"));
>
> Given that this is library code, do we rather want to have
> `last_modified_init()` return an error code and let the caller die?

Yes, agreed.

>> +}
>> +
>> +void last_modified_release(struct last_modified *lm)
>> +{
>> +	hashmap_clear_and_free(&lm->paths, struct last_modified_entry, hashent);
>> +	release_revisions(&lm->rev);
>> +}
>> +
>> +struct last_modified_callback_data {
>> +	struct commit *commit;
>> +	struct hashmap *paths;
>> +
>> +	last_modified_callback callback;
>> +	void *callback_data;
>> +};
>> +
>> +static void mark_path(const char *path, const struct object_id *oid,
>> +		      struct last_modified_callback_data *data)
>> +{
>> +	struct last_modified_entry *ent;
>> +
>> +	/* Is it even a path that we are interested in? */
>> +	ent = hashmap_get_entry_from_hash(data->paths, strhash(path), path,
>> +					  struct last_modified_entry, hashent);
>> +	if (!ent)
>> +		return;
>
> Yup, so this here is the filter to figure out whether we care for a
> path, which uses the `paths` map we have populated at the beginning.
>
>> +	/* Have we already found a commit? */
>> +	if (ent->commit)
>> +		return;
>
> Can this case even be hit? We remove the entry from the map once we have
> seen it, so I'd expect that we never hit the same commit map entry
> twice. If so, can this be converted to a `BUG()` or am I missing the
> obvious?

I was wondering about that. But took it over from the original code, and
left it in. I believe it's from a version where entries weren't removed
from the hashmap. I agree a `BUG()` would be a better approach, or
even better simply delete this condition.

>> +	/*
>> +	 * Is it arriving at a version of interest, or is it from a side branch
>> +	 * which did not contribute to the final state?
>> +	 */
>> +	if (!oideq(oid, &ent->oid))
>> +		return;
>> +
>> +	ent->commit = data->commit;
>> +	if (data->callback)
>> +		data->callback(path, data->commit, data->callback_data);
>> +
>> +	hashmap_remove(data->paths, &ent->hashent, path);
>
> And we end up removing that entry from paths so that we don't revisit it
> in the future. After all, we're only interested in a single commit per
> path.

True. So it doesn't even make sense a `struct last_modified_entry` has a
`commit` attribute. I will delete that in next version.

>> +	free(ent);
>> +}
>> +
>> +static void last_modified_diff(struct diff_queue_struct *q,
>> +		       struct diff_options *opt UNUSED, void *cbdata)
>> +{
>> +	struct last_modified_callback_data *data = cbdata;
>> +
>> +	for (int i = 0; i < q->nr; i++) {
>> +		struct diff_filepair *p = q->queue[i];
>> +		switch (p->status) {
>> +		case DIFF_STATUS_DELETED:
>> +			/*
>> +			 * There's no point in feeding a deletion, as it could
>> +			 * not have resulted in our current state, which
>> +			 * actually has the file.
>> +			 */
>> +			break;
>> +
>> +		default:
>> +			/*
>> +			 * Otherwise, we care only that we somehow arrived at
>> +			 * a final path/sha1 state. Note that this covers some
>> +			 * potentially controversial areas, including:
>> +			 *
>> +			 *  1. A rename or copy will be found, as it is the
>> +			 *     first time the content has arrived at the given
>> +			 *     path.
>> +			 *
>> +			 *  2. Even a non-content modification like a mode or
>> +			 *     type change will trigger it.
>
> Curious, but sensible. We're looking for the last time a specific tree
> entry was changed, and that of course includes modifications. I could
> totally see that we may eventually want to add a flag that ignores such
> mode changes and only presents content changes. But for now I agree that
> this is sensible.
>
>> +			 * We take the inclusive approach for now, and find
>> +			 * anything which impacts the path. Options to tweak
>> +			 * the behavior (e.g., to "--follow" the content across
>> +			 * renames) can come later.
>> +			 */
>> +			mark_path(p->two->path, &p->two->oid, data);
>> +			break;
>> +		}
>> +	}
>> +}
>> +
>> +int last_modified_run(struct last_modified *lm, last_modified_callback cb, void *cbdata)
>> +{
>> +	struct last_modified_callback_data data;
>> +
>> +	data.paths = &lm->paths;
>> +	data.callback = cb;
>> +	data.callback_data = cbdata;
>> +
>> +	lm->rev.diffopt.output_format = DIFF_FORMAT_CALLBACK;
>> +	lm->rev.diffopt.format_callback = last_modified_diff;
>> +	lm->rev.diffopt.format_callback_data = &data;
>> +
>> +	prepare_revision_walk(&lm->rev);
>> +
>> +	while (hashmap_get_size(&lm->paths)) {
>
> Okay, and this is the core of our logic: we continue walking the tree
> until there are no more paths that we care about.
>
>> diff --git a/last-modified.h b/last-modified.h
>> new file mode 100644
>> index 0000000000..42a819d979
>> --- /dev/null
>> +++ b/last-modified.h
>> @@ -0,0 +1,27 @@
>> +#ifndef LAST_MODIFIED_H
>> +#define LAST_MODIFIED_H
>> +
>> +#include "commit.h"
>> +#include "revision.h"
>> +#include "hashmap.h"
>> +
>> +struct last_modified {
>> +	struct hashmap paths;
>> +	struct rev_info rev;
>> +};
>> +
>> +void last_modified_init(struct last_modified *lm,
>> +		     struct repository *r,
>> +		     const char *prefix,
>> +		     int argc, const char **argv);
>> +
>> +void last_modified_release(struct last_modified *);
>> +
>> +typedef void (*last_modified_callback)(const char *path,
>> +				    const struct commit *commit,
>> +				    void *data);
>> +int last_modified_run(struct last_modified *lm,
>> +		   last_modified_callback cb,
>> +		   void *cbdata);
>> +#endif /* LAST_MODIFIED_H */
>
> It would be nice to have some documentation for each of these functions
> as well as a bit of a higher-level conceptual info.

Will do.

-- 
Cheers,
Toon




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux