Some cleanup and parallel gix feature

master^2
Wynd 2025-01-19 00:15:43 +02:00
parent 54d36fafcb
commit 21e0247d7b
3 changed files with 132 additions and 336 deletions

11
Cargo.lock generated
View File

@ -234,6 +234,15 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "crossbeam-channel"
version = "0.5.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471"
dependencies = [
"crossbeam-utils",
]
[[package]] [[package]]
name = "crossbeam-deque" name = "crossbeam-deque"
version = "0.8.6" version = "0.8.6"
@ -536,12 +545,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac7045ac9fe5f9c727f38799d002a7ed3583cd777e3322a7c4b43e3cf437dc69" checksum = "ac7045ac9fe5f9c727f38799d002a7ed3583cd777e3322a7c4b43e3cf437dc69"
dependencies = [ dependencies = [
"crc32fast", "crc32fast",
"crossbeam-channel",
"flate2", "flate2",
"gix-hash", "gix-hash",
"gix-trace", "gix-trace",
"gix-utils", "gix-utils",
"libc", "libc",
"once_cell", "once_cell",
"parking_lot",
"prodash", "prodash",
"sha1_smol", "sha1_smol",
"thiserror", "thiserror",

View File

@ -21,7 +21,10 @@ bench = false
unsafe_code = { level = "forbid" } unsafe_code = { level = "forbid" }
[dependencies] [dependencies]
gix = { version = "0.66.0", default-features = false, features = ["mailmap"] } gix = { version = "0.66.0", default-features = false, features = [
"mailmap",
"parallel",
] }
clap = { version = "4.5.20", features = ["derive"] } clap = { version = "4.5.20", features = ["derive"] }
chrono = { version = "0.4.38" } chrono = { version = "0.4.38" }
itertools = { version = "0.13.0" } itertools = { version = "0.13.0" }

View File

@ -2,7 +2,6 @@
use std::{ use std::{
cmp::Reverse, cmp::Reverse,
collections::HashSet,
path::{self, PathBuf}, path::{self, PathBuf},
sync::OnceLock, sync::OnceLock,
}; };
@ -103,7 +102,7 @@ pub fn get_commits(
start_date: NaiveDate, start_date: NaiveDate,
end_date: NaiveDate, end_date: NaiveDate,
) -> anyhow::Result<(usize, usize, Vec<Commit>)> { ) -> anyhow::Result<(usize, usize, Vec<Commit>)> {
let mut commits: HashSet<Commit> = HashSet::new(); let mut commits: Vec<Commit> = vec![];
let ignored_repos = args.ignored_repos.as_ref().unwrap_or(&vec![]).to_owned(); let ignored_repos = args.ignored_repos.as_ref().unwrap_or(&vec![]).to_owned();
@ -150,363 +149,146 @@ pub fn get_commits(
let mut branches_count: usize = 0; let mut branches_count: usize = 0;
for (i, repo_path) in repos.iter().enumerate() { for (i, repo_path) in repos.iter().enumerate() {
// let repo = ThreadSafeRepository::open(repo_path) let repo = ThreadSafeRepository::open(repo_path).unwrap();
// .unwrap() // let repo = gix::open(repo_path).unwrap();
// .to_thread_local();
let repo = gix::open(repo_path).unwrap();
let branch_names = &*branches[i]; let branch_names = &*branches[i];
let mut branches = vec![]; let branches = get_repo_branches(&repo, branch_names).unwrap();
if branch_names.is_empty() {
branches = repo
.references()?
.prefixed("refs/heads")?
.filter_map(Result::ok)
.filter_map(|b| {
b.inner
.name
.to_string()
.strip_prefix("refs/heads/")
.map(|s| s.to_string())
})
.collect_vec();
}
else {
let branch_names = branch_names.split(' ').map(|s| s.to_string());
branches.extend(branch_names);
}
let mailmap = Mailmap::new(repo_path); let mailmap = Mailmap::new(repo_path);
let branch_commits: Vec<Commit> = branches let branch_commits: Vec<_> = branches
.par_iter() .par_iter()
.filter_map(|branch| { .filter_map(|branch| get_commit_ids(&repo, branch, start_date))
let mut repo_commits: Vec<Commit> = vec![];
let repo = ThreadSafeRepository::open(repo_path)
.unwrap()
.to_thread_local();
// When passing the default @ (HEAD) branch this might actually not exist at all
// locally so we're skipping it
let rev = repo.rev_parse(&**branch).ok()?;
let branch_commits = rev
.single()
.unwrap()
.ancestors()
.sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan {
seconds: start_date.timestamp(),
})
.all()
.ok()?;
branch_commits
.filter_map(|c| c.ok())
.filter_map(|c| c.object().ok())
.filter_map(|c| {
let title = c
.message()
.ok()?
.title
.trim_ascii()
.to_str()
.ok()?
.to_string();
if args.no_merges {
let is_merge = c.parent_ids().count() > 1;
if is_merge {
return None;
}
}
let author = c.author().ok()?;
let email = author.email.to_string();
let name = author.name.to_string();
let author = Author { name, email };
let author = mailmap.resolve(author);
if !authors.is_empty() && !authors.contains(&author.name) {
return None;
}
let time = c.time().ok()?;
let time = DateTime::from_timestamp_millis(time.seconds * 1000)?
.with_timezone(&Local);
if time < start_date || time > end_date {
return None;
}
Some(Commit {
id: c.id,
title,
author,
time,
})
})
.for_each(|c| {
repo_commits.push(c);
});
Some(repo_commits)
})
.reduce(Vec::new, |mut c, n| { .reduce(Vec::new, |mut c, n| {
c.extend(n); c.extend(n);
c c
}); });
let repo = repo.to_thread_local();
let branch_commits = branch_commits
.into_iter()
.unique()
.filter_map(|c| repo.find_commit(c).ok())
.filter_map(|c| {
let title = c
.message()
.ok()?
.title
.trim_ascii()
.to_str()
.ok()?
.to_string();
if args.no_merges {
let is_merge = c.parent_ids().count() > 1;
if is_merge {
return None;
}
}
let author = c.author().ok()?;
let email = author.email.to_string();
let name = author.name.to_string();
let author = Author { name, email };
let author = mailmap.resolve(author);
if !authors.is_empty() && !authors.contains(&author.name) {
return None;
}
let time = c.time().ok()?;
let time =
DateTime::from_timestamp_millis(time.seconds * 1000)?.with_timezone(&Local);
if time < start_date || time > end_date {
return None;
}
Some(Commit {
id: c.id,
title,
author,
time,
})
})
.collect_vec();
if !branch_commits.is_empty() { if !branch_commits.is_empty() {
repos_count += 1; repos_count += 1;
branches_count += branches.len(); branches_count += branches.len();
} }
// for vec in branch_commits {
commits.extend(branch_commits); commits.extend(branch_commits);
// }
} }
//NOTE: rayon on the entire repos set
// let dataset: Vec<(Vec<Commit>, usize, usize)> = repos
// .par_iter()
// .enumerate()
// .filter_map(|(i, repo_path)| {
// let mut repos_count: usize = 0;
// let mut branches_count: usize = 0;
// let mut repo_commits: Vec<Commit> = vec![];
//
// let repo = gix::open(repo_path).unwrap();
//
// let branch_names = &*branches[i];
// let mut branches: Vec<String> = vec![];
//
// if branch_names.is_empty() {
// branches = repo
// .references()
// .ok()?
// .prefixed("refs/heads")
// .ok()?
// .filter_map(Result::ok)
// .filter_map(|b| {
// b.inner
// .name
// .to_string()
// .strip_prefix("refs/heads/")
// .map(|s| s.to_string())
// })
// .collect_vec();
// }
// else {
// let branch_names = branch_names.split(' ').map(|s| s.to_string());
// branches.extend(branch_names);
// }
//
// let mailmap = Mailmap::new(repo_path);
// let mut has_commits = false;
//
// for branch in &branches {
// // When passing the default @ (HEAD) branch this might actually not exist at all
// // locally so we're skipping it
// let Ok(rev) = repo.rev_parse(&**branch)
// else {
// continue;
// };
//
// let branch_commits = rev
// .single()
// .unwrap()
// .ancestors()
// .sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan {
// seconds: start_date.timestamp(),
// })
// .all()
// .ok()?;
//
// branch_commits
// .filter_map(|c| c.ok())
// .filter_map(|c| c.object().ok())
// .filter_map(|c| {
// let title = c
// .message()
// .ok()?
// .title
// .trim_ascii()
// .to_str()
// .ok()?
// .to_string();
//
// if args.no_merges {
// let is_merge = c.parent_ids().count() > 1;
// if is_merge {
// return None;
// }
// }
//
// let author = c.author().ok()?;
//
// let email = author.email.to_string();
// let name = author.name.to_string();
//
// let author = Author { name, email };
// let author = mailmap.resolve(author);
//
// if !authors.is_empty() && !authors.contains(&author.name) {
// return None;
// }
//
// let time = c.time().ok()?;
// let time = DateTime::from_timestamp_millis(time.seconds * 1000)?
// .with_timezone(&Local);
// if time < start_date || time > end_date {
// return None;
// }
//
// has_commits = true;
//
// Some(Commit {
// id: c.id,
// title,
// author,
// time,
// })
// })
// .for_each(|c| {
// repo_commits.push(c);
// });
// }
//
// if has_commits {
// repos_count += 1;
// branches_count += branches.len();
// }
//
// Some((repo_commits, repos_count, branches_count))
// })
// .collect();
//
// let mut repos_count = 0;
// let mut branches_count = 0;
// let mut repo_commits: Vec<Commit> = vec![];
//NOTE: simple for loop
// let dataset: (usize, &PathBuf) = repos.par_iter().enumerate().collect();
//
// for (i, repo_path) in dataset {
// let repo = gix::open(repo_path).unwrap();
//
// let branch_names = &*branches[i];
// let mut branches = vec![];
// if branch_names.is_empty() {
// branches = repo
// .references()?
// .prefixed("refs/heads")?
// .filter_map(Result::ok)
// .filter_map(|b| {
// b.inner
// .name
// .to_string()
// .strip_prefix("refs/heads/")
// .map(|s| s.to_string())
// })
// .collect_vec();
// }
// else {
// let branch_names = branch_names.split(' ').map(|s| s.to_string());
// branches.extend(branch_names);
// }
//
// let mailmap = Mailmap::new(repo_path);
// let mut has_commits = false;
//
// for branch in &branches {
// // When passing the default @ (HEAD) branch this might actually not exist at all
// // locally so we're skipping it
// let Ok(rev) = repo.rev_parse(&**branch)
// else {
// continue;
// };
//
// let branch_commits = rev
// .single()
// .unwrap()
// .ancestors()
// .sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan {
// seconds: start_date.timestamp(),
// })
// .all()?;
//
// branch_commits
// .filter_map(|c| c.ok())
// .filter_map(|c| c.object().ok())
// .filter_map(|c| {
// let title = c
// .message()
// .ok()?
// .title
// .trim_ascii()
// .to_str()
// .ok()?
// .to_string();
//
// if args.no_merges {
// let is_merge = c.parent_ids().count() > 1;
// if is_merge {
// return None;
// }
// }
//
// let author = c.author().ok()?;
//
// let email = author.email.to_string();
// let name = author.name.to_string();
//
// let author = Author { name, email };
// let author = mailmap.resolve(author);
//
// if !authors.is_empty() && !authors.contains(&author.name) {
// return None;
// }
//
// let time = c.time().ok()?;
// let time =
// DateTime::from_timestamp_millis(time.seconds * 1000)?.with_timezone(&Local);
// if time < start_date || time > end_date {
// return None;
// }
//
// has_commits = true;
//
// Some(Commit {
// id: c.id,
// title,
// author,
// time,
// })
// })
// .for_each(|c| {
// commits.insert(c);
// });
// }
//
// if has_commits {
// repos_count += 1;
// branches_count += branches.len();
// }
// }
let mut commits: Vec<Commit> = commits.into_par_iter().collect::<Vec<Commit>>();
commits.par_sort_by_cached_key(|a| Reverse(a.time)); commits.par_sort_by_cached_key(|a| Reverse(a.time));
// .sorted_by_cached_key(|a| Reverse(a.time))
// .collect_vec();
Ok((repos_count, branches_count, commits)) Ok((repos_count, branches_count, commits))
} }
/// Resolves the list of branch names to scan for `repo`.
///
/// When `branch_names` is empty, every reference under `refs/heads/` is
/// returned with that prefix stripped. Otherwise `branch_names` is treated as
/// a whitespace-separated list of names and returned as-is.
///
/// Returns `None` if the repository's references cannot be listed.
fn get_repo_branches(repo: &ThreadSafeRepository, branch_names: &str) -> Option<Vec<String>> {
    if !branch_names.is_empty() {
        // `split_whitespace` drops the empty entries that `split(' ')` would
        // yield for leading, trailing or repeated spaces; those can never name
        // a branch but would still be counted by callers using `len()`.
        return Some(branch_names.split_whitespace().map(str::to_owned).collect());
    }

    let repo = repo.to_thread_local();
    let refs = repo.references().ok()?;
    let heads = refs.prefixed("refs/heads").ok()?;

    let branches = heads
        // References that fail to load are skipped instead of aborting the listing.
        .filter_map(Result::ok)
        .filter_map(|b| {
            b.inner
                .name
                .to_string()
                .strip_prefix("refs/heads/")
                .map(str::to_owned)
        })
        .collect();

    Some(branches)
}
/// Collects the ids of the commits reachable from `branch` in `repo`.
///
/// The ancestor walk uses `Sorting::ByCommitTimeNewestFirstCutoffOlderThan`
/// with `start_date` (as a Unix timestamp in seconds) as the cutoff.
///
/// Returns `None` when `branch` cannot be resolved, when it does not resolve
/// to a single revision (e.g. a range spec), or when the walk cannot be
/// started. Entries that fail during the walk are skipped.
fn get_commit_ids(
    repo: &ThreadSafeRepository,
    branch: &str,
    start_date: DateTime<Local>,
) -> Option<Vec<ObjectId>> {
    let repo = repo.to_thread_local();

    // When passing the default @ (HEAD) branch this might actually not exist at all
    // locally so we're skipping it
    let rev = repo.rev_parse(branch).ok()?;

    // A spec that is not a single revision is skipped like an unresolvable
    // one, rather than panicking on user-provided input.
    let tip = rev.single()?;

    let walk = tip
        .ancestors()
        .sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan {
            seconds: start_date.timestamp(),
        })
        .all()
        .ok()?;

    let commits: Vec<ObjectId> = walk.filter_map(Result::ok).map(|c| c.id).collect();

    Some(commits)
}
fn find_git_repos(scan_path: &path::Path, repos: &mut Vec<PathBuf>, ignored_repos: &Vec<String>) { fn find_git_repos(scan_path: &path::Path, repos: &mut Vec<PathBuf>, ignored_repos: &Vec<String>) {
if let Some(path) = walk_dir(scan_path, ignored_repos) { if let Some(path) = walk_dir(scan_path, ignored_repos) {
repos.extend(path) repos.extend(path)