From 21e0247d7bf4803c5f90f7c08e275c53fee35ab6 Mon Sep 17 00:00:00 2001 From: Wynd Date: Sun, 19 Jan 2025 00:15:43 +0200 Subject: [PATCH] Some cleanup and parallel gix feature --- Cargo.lock | 11 ++ Cargo.toml | 5 +- src/lib.rs | 452 ++++++++++++++--------------------------------------- 3 files changed, 132 insertions(+), 336 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c27f429..f3211ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -234,6 +234,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -536,12 +545,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac7045ac9fe5f9c727f38799d002a7ed3583cd777e3322a7c4b43e3cf437dc69" dependencies = [ "crc32fast", + "crossbeam-channel", "flate2", "gix-hash", "gix-trace", "gix-utils", "libc", "once_cell", + "parking_lot", "prodash", "sha1_smol", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index 2ea07e2..f703372 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,10 @@ bench = false unsafe_code = { level = "forbid" } [dependencies] -gix = { version = "0.66.0", default-features = false, features = ["mailmap"] } +gix = { version = "0.66.0", default-features = false, features = [ + "mailmap", + "parallel", +] } clap = { version = "4.5.20", features = ["derive"] } chrono = { version = "0.4.38" } itertools = { version = "0.13.0" } diff --git a/src/lib.rs b/src/lib.rs index 778c213..e434745 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,6 @@ use std::{ cmp::Reverse, - collections::HashSet, path::{self, PathBuf}, sync::OnceLock, }; @@ -103,7 +102,7 @@ pub fn get_commits( start_date: NaiveDate, end_date: NaiveDate, ) -> anyhow::Result<(usize, usize, Vec)> { - let mut commits: HashSet = HashSet::new(); + let mut commits: Vec = vec![]; let ignored_repos = args.ignored_repos.as_ref().unwrap_or(&vec![]).to_owned(); @@ -150,363 +149,146 @@ pub fn get_commits( let mut branches_count: usize = 0; for (i, repo_path) in repos.iter().enumerate() { - // let repo = ThreadSafeRepository::open(repo_path) - // .unwrap() - // .to_thread_local(); - let repo = gix::open(repo_path).unwrap(); + let repo = ThreadSafeRepository::open(repo_path).unwrap(); + // let repo = gix::open(repo_path).unwrap(); let branch_names = &*branches[i]; - let mut branches = vec![]; - if branch_names.is_empty() { - branches = repo - .references()? - .prefixed("refs/heads")? - .filter_map(Result::ok) - .filter_map(|b| { - b.inner - .name - .to_string() - .strip_prefix("refs/heads/") - .map(|s| s.to_string()) - }) - .collect_vec(); - } - else { - let branch_names = branch_names.split(' ').map(|s| s.to_string()); - branches.extend(branch_names); - } + let branches = get_repo_branches(&repo, branch_names).unwrap(); let mailmap = Mailmap::new(repo_path); - let branch_commits: Vec = branches + let branch_commits: Vec<_> = branches .par_iter() - .filter_map(|branch| { - let mut repo_commits: Vec = vec![]; - - let repo = ThreadSafeRepository::open(repo_path) - .unwrap() - .to_thread_local(); - - // When passing the default @ (HEAD) branch this might actually not exist at all - // locally so we're skipping it - let rev = repo.rev_parse(&**branch).ok()?; - - let branch_commits = rev - .single() - .unwrap() - .ancestors() - .sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan { - seconds: start_date.timestamp(), - }) - .all() - .ok()?; - - branch_commits - .filter_map(|c| c.ok()) - .filter_map(|c| c.object().ok()) - .filter_map(|c| { - let title = c - .message() - .ok()? - .title - .trim_ascii() - .to_str() - .ok()? - .to_string(); - - if args.no_merges { - let is_merge = c.parent_ids().count() > 1; - if is_merge { - return None; - } - } - - let author = c.author().ok()?; - - let email = author.email.to_string(); - let name = author.name.to_string(); - - let author = Author { name, email }; - let author = mailmap.resolve(author); - - if !authors.is_empty() && !authors.contains(&author.name) { - return None; - } - - let time = c.time().ok()?; - let time = DateTime::from_timestamp_millis(time.seconds * 1000)? - .with_timezone(&Local); - if time < start_date || time > end_date { - return None; - } - - Some(Commit { - id: c.id, - title, - author, - time, - }) - }) - .for_each(|c| { - repo_commits.push(c); - }); - - Some(repo_commits) - }) + .filter_map(|branch| get_commit_ids(&repo, branch, start_date)) .reduce(Vec::new, |mut c, n| { c.extend(n); c }); + let repo = repo.to_thread_local(); + + let branch_commits = branch_commits + .into_iter() + .unique() + .filter_map(|c| repo.find_commit(c).ok()) + .filter_map(|c| { + let title = c + .message() + .ok()? + .title + .trim_ascii() + .to_str() + .ok()? + .to_string(); + + if args.no_merges { + let is_merge = c.parent_ids().count() > 1; + if is_merge { + return None; + } + } + + let author = c.author().ok()?; + + let email = author.email.to_string(); + let name = author.name.to_string(); + + let author = Author { name, email }; + let author = mailmap.resolve(author); + + if !authors.is_empty() && !authors.contains(&author.name) { + return None; + } + + let time = c.time().ok()?; + let time = + DateTime::from_timestamp_millis(time.seconds * 1000)?.with_timezone(&Local); + if time < start_date || time > end_date { + return None; + } + + Some(Commit { + id: c.id, + title, + author, + time, + }) + }) + .collect_vec(); + if !branch_commits.is_empty() { repos_count += 1; branches_count += branches.len(); } - // for vec in branch_commits { commits.extend(branch_commits); - // } } - //NOTE: rayon on the entire repos set - - // let dataset: Vec<(Vec, usize, usize)> = repos - // .par_iter() - // .enumerate() - // .filter_map(|(i, repo_path)| { - // let mut repos_count: usize = 0; - // let mut branches_count: usize = 0; - // let mut repo_commits: Vec = vec![]; - // - // let repo = gix::open(repo_path).unwrap(); - // - // let branch_names = &*branches[i]; - // let mut branches: Vec = vec![]; - // - // if branch_names.is_empty() { - // branches = repo - // .references() - // .ok()? - // .prefixed("refs/heads") - // .ok()? - // .filter_map(Result::ok) - // .filter_map(|b| { - // b.inner - // .name - // .to_string() - // .strip_prefix("refs/heads/") - // .map(|s| s.to_string()) - // }) - // .collect_vec(); - // } - // else { - // let branch_names = branch_names.split(' ').map(|s| s.to_string()); - // branches.extend(branch_names); - // } - // - // let mailmap = Mailmap::new(repo_path); - // let mut has_commits = false; - // - // for branch in &branches { - // // When passing the default @ (HEAD) branch this might actually not exist at all - // // locally so we're skipping it - // let Ok(rev) = repo.rev_parse(&**branch) - // else { - // continue; - // }; - // - // let branch_commits = rev - // .single() - // .unwrap() - // .ancestors() - // .sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan { - // seconds: start_date.timestamp(), - // }) - // .all() - // .ok()?; - // - // branch_commits - // .filter_map(|c| c.ok()) - // .filter_map(|c| c.object().ok()) - // .filter_map(|c| { - // let title = c - // .message() - // .ok()? - // .title - // .trim_ascii() - // .to_str() - // .ok()? - // .to_string(); - // - // if args.no_merges { - // let is_merge = c.parent_ids().count() > 1; - // if is_merge { - // return None; - // } - // } - // - // let author = c.author().ok()?; - // - // let email = author.email.to_string(); - // let name = author.name.to_string(); - // - // let author = Author { name, email }; - // let author = mailmap.resolve(author); - // - // if !authors.is_empty() && !authors.contains(&author.name) { - // return None; - // } - // - // let time = c.time().ok()?; - // let time = DateTime::from_timestamp_millis(time.seconds * 1000)? - // .with_timezone(&Local); - // if time < start_date || time > end_date { - // return None; - // } - // - // has_commits = true; - // - // Some(Commit { - // id: c.id, - // title, - // author, - // time, - // }) - // }) - // .for_each(|c| { - // repo_commits.push(c); - // }); - // } - // - // if has_commits { - // repos_count += 1; - // branches_count += branches.len(); - // } - // - // Some((repo_commits, repos_count, branches_count)) - // }) - // .collect(); - // - // let mut repos_count = 0; - // let mut branches_count = 0; - // let mut repo_commits: Vec = vec![]; - - //NOTE: simple for loop - - // let dataset: (usize, &PathBuf) = repos.par_iter().enumerate().collect(); - // - // for (i, repo_path) in dataset { - // let repo = gix::open(repo_path).unwrap(); - // - // let branch_names = &*branches[i]; - // let mut branches = vec![]; - // if branch_names.is_empty() { - // branches = repo - // .references()? - // .prefixed("refs/heads")? - // .filter_map(Result::ok) - // .filter_map(|b| { - // b.inner - // .name - // .to_string() - // .strip_prefix("refs/heads/") - // .map(|s| s.to_string()) - // }) - // .collect_vec(); - // } - // else { - // let branch_names = branch_names.split(' ').map(|s| s.to_string()); - // branches.extend(branch_names); - // } - // - // let mailmap = Mailmap::new(repo_path); - // let mut has_commits = false; - // - // for branch in &branches { - // // When passing the default @ (HEAD) branch this might actually not exist at all - // // locally so we're skipping it - // let Ok(rev) = repo.rev_parse(&**branch) - // else { - // continue; - // }; - // - // let branch_commits = rev - // .single() - // .unwrap() - // .ancestors() - // .sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan { - // seconds: start_date.timestamp(), - // }) - // .all()?; - // - // branch_commits - // .filter_map(|c| c.ok()) - // .filter_map(|c| c.object().ok()) - // .filter_map(|c| { - // let title = c - // .message() - // .ok()? - // .title - // .trim_ascii() - // .to_str() - // .ok()? - // .to_string(); - // - // if args.no_merges { - // let is_merge = c.parent_ids().count() > 1; - // if is_merge { - // return None; - // } - // } - // - // let author = c.author().ok()?; - // - // let email = author.email.to_string(); - // let name = author.name.to_string(); - // - // let author = Author { name, email }; - // let author = mailmap.resolve(author); - // - // if !authors.is_empty() && !authors.contains(&author.name) { - // return None; - // } - // - // let time = c.time().ok()?; - // let time = - // DateTime::from_timestamp_millis(time.seconds * 1000)?.with_timezone(&Local); - // if time < start_date || time > end_date { - // return None; - // } - // - // has_commits = true; - // - // Some(Commit { - // id: c.id, - // title, - // author, - // time, - // }) - // }) - // .for_each(|c| { - // commits.insert(c); - // }); - // } - // - // if has_commits { - // repos_count += 1; - // branches_count += branches.len(); - // } - // } - - let mut commits: Vec = commits.into_par_iter().collect::>(); - commits.par_sort_by_cached_key(|a| Reverse(a.time)); - // .sorted_by_cached_key(|a| Reverse(a.time)) - // .collect_vec(); Ok((repos_count, branches_count, commits)) } +fn get_repo_branches(repo: &ThreadSafeRepository, branch_names: &str) -> Option> { + if branch_names.is_empty() { + let repo = repo.to_thread_local(); + let Ok(refs) = repo.references() + else { + return None; + }; + + let Ok(prefix) = refs.prefixed("refs/heads") + else { + return None; + }; + + let branches = prefix + .filter_map(Result::ok) + .filter_map(|b| { + b.inner + .name + .to_string() + .strip_prefix("refs/heads/") + .map(|s| s.to_string()) + }) + .collect(); + + Some(branches) + } + else { + Some(branch_names.split(' ').map(|s| s.to_string()).collect()) + } +} + +fn get_commit_ids( + repo: &ThreadSafeRepository, + branch: &str, + start_date: DateTime, +) -> Option> { + let repo = repo.to_thread_local(); + + // When passing the default @ (HEAD) branch this might actually not exist at all + // locally so we're skipping it + let rev = repo.rev_parse(branch).ok()?; + + let branch_commits = rev + .single() + .unwrap() + .ancestors() + .sorting(Sorting::ByCommitTimeNewestFirstCutoffOlderThan { + seconds: start_date.timestamp(), + }) + .all() + .ok()?; + + let commits = branch_commits + .filter_map(|c| c.ok()) + .map(|c| c.id) + .collect(); + + Some(commits) +} + fn find_git_repos(scan_path: &path::Path, repos: &mut Vec, ignored_repos: &Vec) { if let Some(path) = walk_dir(scan_path, ignored_repos) { repos.extend(path)