Commit 01418d2

Authored Aug 27, 2024

perf(index): lightweight structure (#57)

1 parent c294162 · commit 01418d2

15 files changed: +407 -116 lines changed

‎Cargo.lock

+15 -3

(Generated file; diff not rendered.)

‎Cargo.toml

+2

@@ -6,10 +6,12 @@
 rio_api = '0.8.4'
 rio_turtle = '0.8.4'
 rstest = '0.21.0'
+serde_json = "1.0.127"
 serde_yml = '0.0.10'
 slog = '2.7.0'
 slog-async = '2.8.0'
 slog-term = '2.9.0'
+smallvec = { version = "1.13.2", features = ["serde"] }
 tempfile = '3.10.1'
 
 [dependencies.clap]

‎docs/development-guide.md

+9 -1

@@ -66,7 +66,7 @@ just test
 
 ## Build the Package & Image
 
-To build the package with Nix run:
+To build the package with Nix run:b
 
 ```shell
 just nix-package
@@ -110,3 +110,11 @@ It will:
 **Note: If the release pipeline fails, you can just run this same command again.
 Also rerun it when you made a mistake, it will cancel the current release (works
 also when `--amend`ing on the current commit)**
+
+## Benchmarking performances
+
+A benchmarking script is provided in `tools/bench/benchmark.sh`, along with a nix devshell. To run the benchmark in the isolated environment, run:
+
+```shell
+just nix-develop-bench bash ./tools/bench/benchmark.sh
+```

‎justfile

+7

@@ -23,6 +23,13 @@ nix-develop-ci *args:
     { [ -n "${cmd:-}" ] || cmd=("zsh"); } && \
     nix develop ./tools/nix#ci --command "${cmd[@]}"
 
+# Enter nix development shell for benchmarking.
+nix-develop-bench *args:
+    cd "{{root_dir}}" && \
+    cmd=("$@") && \
+    { [ -n "${cmd:-}" ] || cmd=("zsh"); } && \
+    nix develop ./tools/nix#bench --command "${cmd[@]}"
+
 ## Standard stuff =============================================================
 # Format the code.
 format *args:

‎src/index.rs

+104 -9

@@ -1,38 +1,133 @@
 use rio_api::parser::TriplesParser;
 use rio_turtle::TurtleError;
-use std::{io::Write, path::Path};
+use serde::{Deserialize, Serialize};
+use smallvec::{smallvec, SmallVec};
+use std::{
+    collections::HashMap,
+    hash::{DefaultHasher, Hash, Hasher},
+    path::Path,
+};
 
 use crate::{
     io,
     rdf_types::{Triple, TripleView},
 };
 
-fn index_triple(t: Triple, out: &mut impl Write) {
+/// Stores a mapping from hashed instance uri to their types.
+/// The type URIs are stored once as a vector of strings.
+/// Each subject in map is stored as hash(subject_uri): u64
+/// and refers to its types using their vector index.
+#[derive(Serialize, Deserialize)]
+pub struct TypeIndex {
+    pub types: Vec<String>,
+    map: HashMap<u64, SmallVec<[usize; 1]>>,
+}
+
+impl TypeIndex {
+    fn hash(&self, s: &impl Hash) -> u64 {
+        let mut hasher = DefaultHasher::new();
+        s.hash(&mut hasher);
+        hasher.finish().to_le()
+    }
+
+    pub fn from_iter<'a>(type_map: impl Iterator<Item = (&'a str, &'a str)>) -> Self {
+        let mut idx = TypeIndex::new();
+
+        type_map.for_each(|(subject_uri, type_uri)| idx.insert(subject_uri, type_uri).unwrap());
+
+        return idx;
+    }
+
+    pub fn new() -> Self {
+        TypeIndex {
+            types: Vec::new(),
+            map: HashMap::new(),
+        }
+    }
+
+    // Insert input subject-type mapping into the index.
+    // The index will store the hash of the subject.
+    pub fn insert(&mut self, subject_uri: &str, type_uri: &str) -> Result<(), std::io::Error> {
+        let key = self.hash(&subject_uri.to_string());
+        let type_idx: usize;
+
+        // Get type index or add a new one.
+        if self.types.contains(&type_uri.to_string()) {
+            type_idx = self.types.iter().position(|x| *x == type_uri).unwrap();
+        } else {
+            type_idx = self.types.len();
+            self.types.push(type_uri.to_string());
+        }
+        // Insert mapping into the index.
+        match self.map.get_mut(&key) {
+            Some(v) => {
+                v.push(type_idx);
+            }
+            None => {
+                self.map.insert(key, smallvec![type_idx]);
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn get(&self, subject_key: &str) -> Option<Vec<&str>> {
+        let key = self.hash(&subject_key.to_string());
+        self.map
+            .get(&key)
+            .map(|v| v.iter().map(|i| self.types[*i].as_ref()).collect())
+    }
+}
+
+fn index_triple(t: Triple, index: &mut TypeIndex) {
     if t.predicate.iri.as_str() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" {
-        let r = || -> std::io::Result<()> {
-            out.write_all(t.to_string().as_bytes())?;
-            out.write_all(b" .\n")
-        }();
+        let r = { index.insert(&t.subject.to_string(), &t.object.to_string()) };
 
         if let Err(e) = r {
             panic!("Error writting to out buffer: {e}");
         }
     }
 }
 
-pub fn create_type_map(input: &Path, output: &Path) {
+pub fn create_type_index(input: &Path, output: &Path) {
     let buf_in = io::get_reader(input);
-    let mut buf_out = io::get_writer(output);
+    let buf_out = io::get_writer(output);
     let mut triples = io::parse_ntriples(buf_in);
+    let mut index = TypeIndex::new();
 
     while !triples.is_end() {
         let _ = triples
             .parse_step(&mut |t: TripleView| {
-                index_triple(t.into(), &mut buf_out);
+                index_triple(t.into(), &mut index);
                 Result::<(), TurtleError>::Ok(())
             })
             .inspect_err(|e| {
                 panic!("Parsing error occured: {e}");
             });
     }
+    let _ = serde_json::to_writer(buf_out, &index);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    // Test the parsing of a triple.
+    fn index_from_iter() {
+        let vals = vec![
+            ("<urn:Alice>", "<urn:Person>"),
+            ("<urn:Alice>", "<urn:Employee>"),
+            ("<urn:ACME>", "<urn:Organization>"),
+        ]
+        .into_iter()
+        .map(|(a, b)| (a, b));
+
+        let idx = TypeIndex::from_iter(vals);
+
+        assert_eq!(
+            idx.get("<urn:Alice>").unwrap(),
+            vec!["<urn:Person>", "<urn:Employee>"]
+        );
+        println!("{}", serde_json::to_string(&idx).unwrap());
+    }
 }
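For orientation, here is a small, self-contained sketch of how the `TypeIndex` added above is meant to be used, mirroring the `index_from_iter` test in this diff. The example URIs are illustrative, and the `tripsu::index` path assumes the crate exposes the module as a library; adjust to the actual crate layout.

```rust
use tripsu::index::TypeIndex; // assumed library path, not confirmed by this diff

fn main() {
    // Build an index from (subject, type) pairs, as create_type_index does while parsing triples.
    let pairs = vec![
        ("<urn:Alice>", "<urn:Person>"),
        ("<urn:Alice>", "<urn:Employee>"),
        ("<urn:ACME>", "<urn:Organization>"),
    ];
    let idx = TypeIndex::from_iter(pairs.into_iter());

    // A lookup returns every type recorded for the (hashed) subject URI.
    assert_eq!(
        idx.get("<urn:Alice>").unwrap(),
        vec!["<urn:Person>", "<urn:Employee>"]
    );

    // The whole index serializes to compact JSON: type URIs stored once,
    // subjects stored as u64 hashes pointing at type indices.
    println!("{}", serde_json::to_string(&idx).unwrap());
}
```

Compared to the previous approach of writing the raw `rdf:type` triples to a file and reloading them into a `HashMap<String, String>`, this keeps each type URI in memory only once and lets a subject carry several types.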

‎src/io.rs

+10 -2

@@ -1,4 +1,4 @@
-use crate::rules::Rules;
+use crate::{index::TypeIndex, rules::Rules};
 use rio_turtle::NTriplesParser;
 use std::{
     fs::File,
@@ -46,7 +46,15 @@ pub fn parse_ntriples(reader: impl BufRead) -> NTriplesParser<impl BufRead> {
 pub fn parse_rules(path: &Path) -> Rules {
     return match File::open(path) {
         Ok(file) => serde_yml::from_reader(file).expect("Error parsing rules file."),
-        Err(e) => panic!("Cannot open file '{:?}': '{}'.", path, e),
+        Err(e) => panic!("Cannot open rules file '{:?}': '{}'.", path, e),
+    };
+}
+
+// Parse yaml type index
+pub fn parse_index(path: &Path) -> TypeIndex {
+    return match File::open(path) {
+        Ok(file) => serde_json::from_reader(file).expect("Error parsing index file."),
+        Err(e) => panic!("Cannot open index file '{:?}': '{}'.", path, e),
     };
 }

‎src/log.rs

+1 -1

@@ -20,7 +20,7 @@ pub fn create_logger(use_stdout: bool) -> Arc<Logger> {
         .fuse();
 
     let drain = slog_async::Async::new(drain)
-        .chan_size(5_000_000)
+        .chan_size(1_000)
        .build()
        .fuse();

‎src/main.rs

+2 -2

@@ -10,7 +10,7 @@ mod rules;
 
 // Define the imports.
 use crate::{
-    index::create_type_map,
+    index::create_type_index,
     log::{create_logger, info},
     pseudo::pseudonymize_graph,
 };
@@ -87,7 +87,7 @@ fn main() {
     match cli.command {
         Subcommands::Index(args) => {
             info!(log, "Args: {:?}", args);
-            create_type_map(&args.input, &args.output)
+            create_type_index(&args.input, &args.output)
         }
         Subcommands::Pseudo(args) => {
             info!(log, "Args: {:?}", args);

‎src/pseudo.rs

+7 -26

@@ -1,13 +1,13 @@
 use rio_api::parser::TriplesParser;
 use rio_turtle::TurtleError;
 use std::{
-    collections::HashMap,
-    io::{BufRead, Write},
+    io::Write,
     path::{Path, PathBuf},
 };
 
 use crate::{
     crypto::{new_pseudonymizer, Pseudonymize},
+    index::TypeIndex,
     io,
     log::Logger,
     rdf_types::*,
@@ -19,7 +19,7 @@ use crate::{
 fn process_triple(
     triple: Triple,
     rules_config: &Rules,
-    node_to_type: &HashMap<String, String>,
+    node_to_type: &mut TypeIndex,
     out: &mut impl Write,
     hasher: &dyn Pseudonymize,
 ) {
@@ -35,24 +35,6 @@
     }
 }
 
-// Create a index mapping node -> type from an input ntriples buffer
-fn load_type_map(input: impl BufRead) -> HashMap<String, String> {
-    let mut node_to_type: HashMap<String, String> = HashMap::new();
-    let mut triples = io::parse_ntriples(input);
-
-    while !triples.is_end() {
-        let _: Result<(), TurtleError> = triples.parse_step(&mut |t| {
-            node_to_type.insert(
-                t.subject.to_string().replace(['<', '>'], ""),
-                t.object.to_string().replace(['<', '>'], ""),
-            );
-            Ok(())
-        });
-    }
-
-    return node_to_type;
-}
-
 pub fn pseudonymize_graph(
     _: &Logger,
     input: &Path,
@@ -62,11 +44,10 @@ pub fn pseudonymize_graph(
     secret_path: &Option<PathBuf>,
 ) {
     let buf_input = io::get_reader(input);
-    let buf_index = io::get_reader(index_path);
     let mut buf_output = io::get_writer(output);
 
     let rules = io::parse_rules(rules_path);
-    let node_to_type: HashMap<String, String> = load_type_map(buf_index);
+    let mut type_index = io::parse_index(index_path);
 
     let secret = secret_path.as_ref().map(io::read_bytes);
     let pseudonymizer = new_pseudonymizer(None, secret);
@@ -80,7 +61,7 @@
             process_triple(
                 t.into(),
                 &rules,
-                &node_to_type,
+                &mut type_index,
                 &mut buf_output,
                 &pseudonymizer,
             );
@@ -102,14 +83,14 @@ mod tests {
 
     #[test]
     // Test the parsing of a triple.
-    fn encrypt_nt_file() {
+    fn pseudo_nt_file() {
         let logger = log::create_logger(true);
 
         let dir = tempdir().unwrap();
         let input_path = Path::new("tests/data/test.nt");
         let rules_path = Path::new("tests/data/rules.yaml");
         let output_path = dir.path().join("output.nt");
-        let type_map_path = Path::new("tests/data/type_map.nt");
+        let type_map_path = Path::new("tests/data/type_index.json");
         let key = None;
         pseudonymize_graph(
             &logger,

‎src/rules.rs

+60 -61

@@ -2,7 +2,7 @@ use crate::rdf_types::*;
 use ::std::collections::{HashMap, HashSet};
 use serde::{Deserialize, Serialize};
 
-use crate::model::TripleMask;
+use crate::{index::TypeIndex, model::TripleMask};
 
 /// Rules for pseudonymizing nodes
 #[derive(Serialize, Deserialize, Debug, Default)]
@@ -38,11 +38,7 @@
 }
 
 /// Check all parts of the triple against rules.
-pub fn match_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
+pub fn match_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
     let mut mask =
         match_node_rules(triple, rules, type_map) | match_object_rules(triple, rules, type_map);
 
@@ -54,17 +50,13 @@
 }
 
 /// Check triple against node-pseudonymization rules.
-pub fn match_node_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
+pub fn match_node_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
     let pseudo_subject = match &triple.subject {
-        Subject::NamedNode(n) => match_type(&n.iri, rules, type_map),
+        Subject::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Subject::BlankNode(_) => false,
     };
     let pseudo_object = match &triple.object {
-        Term::NamedNode(n) => match_type(&n.iri, rules, type_map),
+        Term::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Term::BlankNode(_) => false,
         Term::Literal(_) => false,
     };
@@ -81,22 +73,24 @@
 }
 
 /// Checks triple against object-pseudonymization rules
-pub fn match_object_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
-    if match_predicate(&triple.predicate.iri, rules) {
+pub fn match_object_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
+    if match_predicate(&triple.predicate.to_string(), rules) {
         return TripleMask::OBJECT;
     }
 
     let pseudo_object = match &triple.subject {
-        Subject::NamedNode(n) => {
-            match_type_predicate(&n.iri, &triple.predicate.iri, type_map, rules)
-        }
-        Subject::BlankNode(b) => {
-            match_type_predicate(&b.id, &triple.predicate.iri, type_map, rules)
-        }
+        Subject::NamedNode(n) => match_type_predicate(
+            &n.to_string(),
+            &triple.predicate.to_string(),
+            type_map,
+            rules,
+        ),
+        Subject::BlankNode(b) => match_type_predicate(
+            &b.to_string(),
+            &triple.predicate.to_string(),
+            type_map,
+            rules,
+        ),
     };
 
     if pseudo_object {
@@ -107,9 +101,9 @@
 }
 
 /// Check if the type of input instance URI is in the rules.
-fn match_type(subject: &str, rules: &Rules, type_map: &HashMap<String, String>) -> bool {
+fn match_type(subject: &str, rules: &Rules, type_map: &mut TypeIndex) -> bool {
     if let Some(v) = type_map.get(subject) {
-        rules.nodes.of_type.contains(v)
+        v.iter().any(|&i| rules.nodes.of_type.contains(i))
     } else {
         false
     }
@@ -124,19 +118,21 @@ fn match_predicate(predicate: &str, rules: &Rules) -> bool {
 fn match_type_predicate(
     subject: &str,
     predicate: &str,
-    type_map: &HashMap<String, String>,
+    type_map: &mut TypeIndex,
     rules: &Rules,
 ) -> bool {
-    let subject_type = match type_map.get(subject) {
-        None => return false,
-        Some(v) => v,
-    };
-    let preds = rules.objects.on_type_predicate.get(subject_type);
-    if preds.is_none() || !preds.unwrap().contains(predicate) {
+    let Some(instance_types) = type_map.get(subject) else {
         return false;
-    }
+    };
 
-    return true;
+    for typ in instance_types {
+        if let Some(preds) = rules.objects.on_type_predicate.get(typ) {
+            if preds.contains(predicate) {
+                return true;
+            }
+        }
+    }
+    return false;
 }
 
 #[cfg(test)]
@@ -148,20 +144,21 @@ mod tests {
     use serde_yml;
 
     // Instance used in tests
-    const NODE_IRI: &str = "Alice";
-    const PREDICATE_IRI: &str = "hasName";
+    const NODE_IRI: &str = "<Alice>";
+    const PREDICATE_IRI: &str = "<hasName>";
 
     // Helper macro to create a HashMap from pairs
     #[macro_export]
     macro_rules! index {
         () => {
-            ::std::collections::HashMap::new()
+            TypeIndex::new()
         };
 
         ($($key:expr => $value:expr),+ $(,)?) => {
-            ::std::collections::HashMap::from([
-                $((String::from($key), String::from($value))),*
-            ])
+            TypeIndex::from_iter(
+                vec![
+                    $(($key, $value)),*
+                ].into_iter())
         };
     }
 
@@ -171,13 +168,13 @@
 
     #[rstest]
     // Subject is in the rules & type index
-    #[case(index! { NODE_IRI => "Person" }, "Person", true)]
+    #[case(index! { NODE_IRI => "<Person>" }, "<Person>", true)]
     // Subject is in the type index, not in the rules
-    #[case(index! { NODE_IRI => "Person" }, "Bank", false)]
+    #[case(index! { NODE_IRI => "<Person>" }, "<Bank>", false)]
     // Subject is not in the type index
-    #[case(index! { "BankName" => "Bank" }, "Bank", false)]
+    #[case(index! { "<BankName>" => "<Bank>" }, "<Bank>", false)]
     fn type_rule(
-        #[case] index: HashMap<String, String>,
+        #[case] mut index: TypeIndex,
         #[case] rule_type: &str,
         #[case] match_expected: bool,
     ) {
@@ -189,7 +186,7 @@
            "
        ));
 
-        assert_eq!(match_type(NODE_IRI, &rules, &index), match_expected);
+        assert_eq!(match_type(NODE_IRI, &rules, &mut index), match_expected);
     }
 
     #[rstest]
@@ -210,17 +207,17 @@
 
     #[rstest]
     // Subject predicate in config
-    #[case("Person", "hasName", index! { NODE_IRI => "Person" }, true)]
+    #[case("<Person>", "<hasName>", index! { NODE_IRI => "<Person>" }, true)]
     // Subject in config, predicate not
-    #[case("Person", "hasAge", index! { NODE_IRI => "Person" }, false)]
+    #[case("<Person>", "<hasAge>", index! { NODE_IRI => "<Person>" }, false)]
     // Subject predicate not in config
-    #[case("Bob", "hasAge", index! { NODE_IRI => "Person" }, false)]
+    #[case("<Bob>", "<hasAge>", index! { NODE_IRI => "<Person>" }, false)]
     // Subject not in type index
-    #[case("Bob", "hasAge", index! { "Bob" => "Person" }, false)]
+    #[case("<Bob>", "<hasAge>", index! { "<Bob>" => "<Person>" }, false)]
     fn type_predicate_rule(
         #[case] rule_type: &str,
         #[case] rule_predicate: &str,
-        #[case] index: HashMap<String, String>,
+        #[case] mut index: TypeIndex,
         #[case] match_expected: bool,
     ) {
         let rules = parse_rules(&format!(
@@ -233,7 +230,7 @@
        ));
 
         assert_eq!(
-            match_type_predicate(NODE_IRI, PREDICATE_IRI, &index, &rules),
+            match_type_predicate(NODE_IRI, PREDICATE_IRI, &mut index, &rules),
             match_expected
        );
    }
@@ -254,21 +251,23 @@
         let rules: Rules = parse_rules(
             r#"
             nodes:
-              of_type: ["urn:Person"]
+              of_type: ["<urn:Person>"]
             objects:
-              on_predicate: ["urn:hasLastName"]
+              on_predicate: ["<urn:hasLastName>"]
               on_type_predicate:
-                "urn:Person": ["urn:hasAge"]
+                "<urn:Person>": ["<urn:hasAge>"]
             "#,
        );
-        let index = index! {
-            "urn:Alice" => "urn:Person",
-            "urn:Bob" => "urn:Person",
-            "urn:ACME" => "urn:Organization"
+        let mut index = index! {
+            "<urn:Alice>" => "<urn:Person>",
+            "<urn:Bob>" => "<urn:Person>",
+            "<urn:ACME>" => "<urn:Organization>"
         };
+        println!("{}", serde_yml::to_string(&rules).unwrap());
+        println!("{}", serde_json::to_string(&index).unwrap());
         TurtleParser::new(triple.as_ref(), None)
             .parse_all(&mut |t| {
-                let mask = match_rules(&t.into(), &rules, &index);
+                let mask = match_rules(&t.into(), &rules, &mut index);
                 assert_eq!(mask.bits(), expected_mask);
                 Ok(()) as Result<(), TurtleError>
             })
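The behavioural change worth noting in this file is in `match_type`: the old `HashMap<String, String>` held a single type per subject, while the new `TypeIndex` can return several, and the rule now matches if any of them is listed under `nodes.of_type`. A standalone sketch of that check follows; the set contents and URIs are hypothetical, not taken from the repository.

```rust
use std::collections::HashSet;

// Stand-in for rules.nodes.of_type and for what TypeIndex::get() returns.
fn matches_any_type(of_type: &HashSet<String>, instance_types: &[&str]) -> bool {
    // Same shape as the new code: v.iter().any(|&i| rules.nodes.of_type.contains(i))
    instance_types.iter().any(|t| of_type.contains(*t))
}

fn main() {
    let of_type = HashSet::from(["<urn:Person>".to_string()]);
    // A subject that is both an Employee and a Person matches through either type.
    assert!(matches_any_type(&of_type, &["<urn:Employee>", "<urn:Person>"]));
    assert!(!matches_any_type(&of_type, &["<urn:Organization>"]));
}
```

The same any-of semantics carries over to `match_type_predicate`, which now loops over all of a subject's types before giving up.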

‎tests/data/rules.yaml

+7 -7

@@ -4,18 +4,18 @@ invert: false
 # hash URIs of people and online accounts
 nodes:
   of_type:
-    - "http://xmlns.com/foaf/0.1/Person" # All nodes which are rdf:type Person
-    - "http://xmlns.com/foaf/OnlineAccount" # "" OnlineAccount
+    - "<http://xmlns.com/foaf/0.1/Person>" # All nodes which are rdf:type Person
+    - "<http://xmlns.com/foaf/OnlineAccount>" # "" OnlineAccount
 
 objects:
   # hash accesscode values for all nodes
   on_predicate:
-    - "http://schema.org/accessCode"
+    - "<http://schema.org/accessCode>"
   #on_type: # NOTE: not currently supported
   #- "http://example.org/UserAccount"
   # hash name only for instances of person and online account
   on_type_predicate:
-    "http://xmlns.com/foaf/OnlineAccount":
-      - "http://schema.org/name"
-    "http://xmlns.com/foaf/0.1/Person":
-      - "http://schema.org/name"
+    "<http://xmlns.com/foaf/OnlineAccount>":
+      - "<http://schema.org/name>"
+    "<http://xmlns.com/foaf/0.1/Person>":
+      - "<http://schema.org/name>"

‎tests/data/type_index.json

+1

@@ -0,0 +1 @@
+{"types":["<http://xmlns.com/foaf/0.1/Person>","<http://xmlns.com/foaf/OnlineAccount>","<http://xmlns.com/foaf/0.1/Organization>"],"map":{"15212815035200482759":[1],"130358124972442050":[0],"9932096721503705860":[1],"10729855068363610633":[2],"8283467020653172379":[0]}}

‎tests/data/type_map.nt

-4
This file was deleted.

‎tools/bench/benchmark.sh

+171 (new file)

@@ -0,0 +1,171 @@
+#!/usr/bin/env bash
+
+# Benchmark runtime and memory usage of tripsu
+# Compares the working directory version against a baseline branch (main by default)
+
+set -euo pipefail
+
+### Final output path
+OUTPUT="profiling.md"
+PROFILE='release'
+BUILD_ARGS=( )
+[[ "${PROFILE}" == 'release' ]] && BUILD_ARGS+=( '--release' )
+### setup binaries
+
+# baseline binary
+BASE_BRANCH='main'
+
+BASE_DIR=$(mktemp -d)
+BASE_URL="$(git config --get remote.origin.url)"
+(
+    GIT_CLONE_PROTECTION_ACTIVE=false \
+    git clone \
+        --branch "${BASE_BRANCH}" \
+        "${BASE_URL}" \
+        "${BASE_DIR}" \
+    && cd "${BASE_DIR}" \
+    && just build "${BUILD_ARGS[@]}"
+)
+BASE_BIN="${BASE_DIR}/target/${PROFILE}/tripsu"
+
+# current binary
+COMP_BRANCH="$(git rev-parse --abbrev-ref HEAD)"
+just build "${BUILD_ARGS[@]}"
+COMP_BIN="./target/${PROFILE}/tripsu"
+
+# setup data
+DATA_URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/proteomes.rdf.xz"
+INPUT="/tmp/proteomes.nt"
+
+# Download data if needed
+if [ ! -f ${INPUT} ]; then
+    curl "${DATA_URL}" \
+    | xz -dc - \
+    | rdfpipe-rs -i rdf-xml -o nt - \
+    > "${INPUT}" || rm "${INPUT}"
+fi
+
+# setup config
+RULES=$(mktemp)
+BASE_IDX=$(mktemp)
+COMP_IDX=$(mktemp)
+
+cat << EOF > "${RULES}"
+
+nodes:
+  of_type:
+  - "http://purl.uniprot.org/core/Proteome"
+  - "http://purl.uniprot.org/core/Strain"
+
+objects:
+  on_type_predicate:
+    "http://purl.uniprot.org/core/Submission_Citation":
+    - "http://purl.uniprot.org/core/author"
+
+  on_predicate:
+  - "http://purl.org/dc/terms/identifier"
+
+EOF
+
+### Commands to benchmark
+BASE_CMD_IDX="${BASE_BIN} index -o ${BASE_IDX} ${INPUT}"
+COMP_CMD_IDX="${COMP_BIN} index -o ${COMP_IDX} ${INPUT}"
+BASE_CMD_PSD="${BASE_BIN} pseudo -r ${RULES} -x ${BASE_IDX} ${INPUT}"
+COMP_CMD_PSD="${COMP_BIN} pseudo -r ${RULES} -x ${COMP_IDX} ${INPUT}"
+
+### functions for profiling
+
+cpu_prof() {
+    local branch1=$1
+    local cmd1=$2
+    local branch2=$3
+    local cmd2=$4
+    local out=$5
+    hyperfine --export-markdown "${out}" -r 5 \
+        -n "${branch1}" "${cmd1}" \
+        -n "${branch2}" "${cmd2}"
+}
+
+mem_prof() {
+    local name=$1
+    local cmd=$2
+    local heap_out
+    heap_out=$(mktemp)
+    echo -n "$name: "
+    # shellcheck disable=SC2086
+    heaptrack -o "${heap_out}" ${cmd} >/dev/null
+    heaptrack_print "${heap_out}.zst" \
+        | grep '^peak heap memory'
+}
+
+make_report() {
+    local cpu_index=$1
+    local cpu_pseudo=$2
+    local mem_index=$3
+    local mem_pseudo=$4
+    local base_branch=$5
+
+    cat <<-MD
+# tripsu profiling
+
+> date: $(date -u +%Y-%m-%d)
+
+Comparing $(git branch --show-current) against $base_branch.
+
+## Timings
+
+Run time compared using hyperfine
+
+### Indexing
+
+$(cat "${cpu_index}")
+
+### Pseudonymization
+
+$(cat "${cpu_pseudo}")
+
+## Memory
+
+Heap memory usage compared using heaptrack
+
+### Indexing
+
+$(cat "${mem_index}")
+
+### Pseudonymization
+
+$(cat "${mem_pseudo}")
+MD
+}
+
+
+### Run profiling
+
+## Profile cpu time
+HYPF_IDX_OUT=$(mktemp)
+HYPF_PSD_OUT=$(mktemp)
+
+# indexing
+cpu_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" \
+    "${COMP_BRANCH}" "${COMP_CMD_IDX}" "${HYPF_IDX_OUT}"
+# pseudonymization
+cpu_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" \
+    "${COMP_BRANCH}" "${COMP_CMD_IDX}" "${HYPF_PSD_OUT}"
+
+## Profile memory
+HEAP_IDX_OUT=$(mktemp)
+HEAP_PSD_OUT=$(mktemp)
+
+# indexing
+mem_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" > "${HEAP_IDX_OUT}"
+mem_prof "${COMP_BRANCH}" "${COMP_CMD_IDX}" >> "${HEAP_IDX_OUT}"
+# pseudonymization
+mem_prof "${BASE_BRANCH}" "${BASE_CMD_PSD}" > "${HEAP_PSD_OUT}"
+mem_prof "${COMP_BRANCH}" "${COMP_CMD_PSD}" >> "${HEAP_PSD_OUT}"
+
+
+### Reporting
+make_report \
+    "${HYPF_IDX_OUT}" "${HYPF_PSD_OUT}" \
+    "${HEAP_IDX_OUT}" "${HEAP_PSD_OUT}" \
+    "${BASE_BRANCH}" > "${OUTPUT}"

‎tools/nix/flake.nix

+11

@@ -82,6 +82,11 @@
         dasel
       ];
 
+      benchInputs = with pkgs; [
+        hyperfine
+        heaptrack
+      ];
+
       # Things needed at runtime.
       buildInputs = [];
 
@@ -98,6 +103,12 @@
       inherit buildInputs;
       nativeBuildInputs = nativeBuildInputsBasic ++ nativeBuildInputsDev;
     };
+    bench = mkShell {
+      inherit buildInputs;
+      nativeBuildInputs = nativeBuildInputsBasic
+        ++ nativeBuildInputsDev
+        ++ benchInputs;
+    };
 
     ci = mkShell {
       inherit buildInputs;
