Commit 01418d2

Authored Aug 27, 2024

perf(index): lightweight structure (#57)

1 parent c294162 · commit 01418d2

15 files changed: +407 -116 lines changed

‎Cargo.lock

+15 -3

(Generated file; diff not rendered.)

‎Cargo.toml

+2

@@ -6,10 +6,12 @@
 rio_api = '0.8.4'
 rio_turtle = '0.8.4'
 rstest = '0.21.0'
+serde_json = "1.0.127"
 serde_yml = '0.0.10'
 slog = '2.7.0'
 slog-async = '2.8.0'
 slog-term = '2.9.0'
+smallvec = { version = "1.13.2", features = ["serde"] }
 tempfile = '3.10.1'
 
 [dependencies.clap]

‎docs/development-guide.md

+9 -1

@@ -66,7 +66,7 @@ just test
 
 ## Build the Package & Image
 
-To build the package with Nix run:
+To build the package with Nix run:b
 
 ```shell
 just nix-package
@@ -110,3 +110,11 @@ It will:
 **Note: If the release pipeline fails, you can just run this same command again.
 Also rerun it when you made a mistake, it will cancel the current release (works
 also when `--amend`ing on the current commit)**
+
+## Benchmarking performances
+
+A benchmarking script is provided in `tools/bench/benchmark.sh`, along with a nix devshell. To run the benchmark in the isolated environment, run:
+
+```shell
+just nix-develop-bench bash ./tools/bench/benchmark.sh
+```

‎justfile

+7

@@ -23,6 +23,13 @@ nix-develop-ci *args:
     { [ -n "${cmd:-}" ] || cmd=("zsh"); } && \
     nix develop ./tools/nix#ci --command "${cmd[@]}"
 
+# Enter nix development shell for benchmarking.
+nix-develop-bench *args:
+    cd "{{root_dir}}" && \
+    cmd=("$@") && \
+    { [ -n "${cmd:-}" ] || cmd=("zsh"); } && \
+    nix develop ./tools/nix#bench --command "${cmd[@]}"
+
 ## Standard stuff =============================================================
 # Format the code.
 format *args:

‎src/index.rs

+104 -9

@@ -1,38 +1,133 @@
 use rio_api::parser::TriplesParser;
 use rio_turtle::TurtleError;
-use std::{io::Write, path::Path};
+use serde::{Deserialize, Serialize};
+use smallvec::{smallvec, SmallVec};
+use std::{
+    collections::HashMap,
+    hash::{DefaultHasher, Hash, Hasher},
+    path::Path,
+};
 
 use crate::{
     io,
     rdf_types::{Triple, TripleView},
 };
 
-fn index_triple(t: Triple, out: &mut impl Write) {
+/// Stores a mapping from hashed instance uri to their types.
+/// The type URIs are stored once as a vector of strings.
+/// Each subject in map is stored as hash(subject_uri): u64
+/// and refers to its types using their vector index.
+#[derive(Serialize, Deserialize)]
+pub struct TypeIndex {
+    pub types: Vec<String>,
+    map: HashMap<u64, SmallVec<[usize; 1]>>,
+}
+
+impl TypeIndex {
+    fn hash(&self, s: &impl Hash) -> u64 {
+        let mut hasher = DefaultHasher::new();
+        s.hash(&mut hasher);
+        hasher.finish().to_le()
+    }
+
+    pub fn from_iter<'a>(type_map: impl Iterator<Item = (&'a str, &'a str)>) -> Self {
+        let mut idx = TypeIndex::new();
+
+        type_map.for_each(|(subject_uri, type_uri)| idx.insert(subject_uri, type_uri).unwrap());
+
+        return idx;
+    }
+
+    pub fn new() -> Self {
+        TypeIndex {
+            types: Vec::new(),
+            map: HashMap::new(),
+        }
+    }
+
+    // Insert input subject-type mapping into the index.
+    // The index will store the hash of the subject.
+    pub fn insert(&mut self, subject_uri: &str, type_uri: &str) -> Result<(), std::io::Error> {
+        let key = self.hash(&subject_uri.to_string());
+        let type_idx: usize;
+
+        // Get type index or add a new one.
+        if self.types.contains(&type_uri.to_string()) {
+            type_idx = self.types.iter().position(|x| *x == type_uri).unwrap();
+        } else {
+            type_idx = self.types.len();
+            self.types.push(type_uri.to_string());
+        }
+        // Insert mapping into the index.
+        match self.map.get_mut(&key) {
+            Some(v) => {
+                v.push(type_idx);
+            }
+            None => {
+                self.map.insert(key, smallvec![type_idx]);
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn get(&self, subject_key: &str) -> Option<Vec<&str>> {
+        let key = self.hash(&subject_key.to_string());
+        self.map
+            .get(&key)
+            .map(|v| v.iter().map(|i| self.types[*i].as_ref()).collect())
+    }
+}
+
+fn index_triple(t: Triple, index: &mut TypeIndex) {
     if t.predicate.iri.as_str() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" {
-        let r = || -> std::io::Result<()> {
-            out.write_all(t.to_string().as_bytes())?;
-            out.write_all(b" .\n")
-        }();
+        let r = { index.insert(&t.subject.to_string(), &t.object.to_string()) };
 
         if let Err(e) = r {
             panic!("Error writting to out buffer: {e}");
         }
     }
 }
 
-pub fn create_type_map(input: &Path, output: &Path) {
+pub fn create_type_index(input: &Path, output: &Path) {
     let buf_in = io::get_reader(input);
-    let mut buf_out = io::get_writer(output);
+    let buf_out = io::get_writer(output);
     let mut triples = io::parse_ntriples(buf_in);
+    let mut index = TypeIndex::new();
 
     while !triples.is_end() {
         let _ = triples
             .parse_step(&mut |t: TripleView| {
-                index_triple(t.into(), &mut buf_out);
+                index_triple(t.into(), &mut index);
                 Result::<(), TurtleError>::Ok(())
             })
             .inspect_err(|e| {
                 panic!("Parsing error occured: {e}");
             });
     }
+    let _ = serde_json::to_writer(buf_out, &index);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    // Test the parsing of a triple.
+    fn index_from_iter() {
+        let vals = vec![
+            ("<urn:Alice>", "<urn:Person>"),
+            ("<urn:Alice>", "<urn:Employee>"),
+            ("<urn:ACME>", "<urn:Organization>"),
+        ]
+        .into_iter()
+        .map(|(a, b)| (a, b));
+
+        let idx = TypeIndex::from_iter(vals);
+
+        assert_eq!(
+            idx.get("<urn:Alice>").unwrap(),
+            vec!["<urn:Person>", "<urn:Employee>"]
+        );
+        println!("{}", serde_json::to_string(&idx).unwrap());
+    }
 }
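For orientation, here is a small, self-contained sketch of how the `TypeIndex` added above is meant to be used, mirroring the `index_from_iter` test in this diff. The example URIs are illustrative, and the `tripsu::index` path assumes the crate exposes the module as a library; adjust to the actual crate layout.

```rust
use tripsu::index::TypeIndex; // assumed library path, not confirmed by this diff

fn main() {
    // Build an index from (subject, type) pairs, as create_type_index does while parsing triples.
    let pairs = vec![
        ("<urn:Alice>", "<urn:Person>"),
        ("<urn:Alice>", "<urn:Employee>"),
        ("<urn:ACME>", "<urn:Organization>"),
    ];
    let idx = TypeIndex::from_iter(pairs.into_iter());

    // A lookup returns every type recorded for the (hashed) subject URI.
    assert_eq!(
        idx.get("<urn:Alice>").unwrap(),
        vec!["<urn:Person>", "<urn:Employee>"]
    );

    // The whole index serializes to compact JSON: type URIs stored once,
    // subjects stored as u64 hashes pointing at type indices.
    println!("{}", serde_json::to_string(&idx).unwrap());
}
```

Compared to the previous approach of writing the raw `rdf:type` triples to a file and reloading them into a `HashMap<String, String>`, this keeps each type URI in memory only once and lets a subject carry several types.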

‎src/io.rs

+10 -2

@@ -1,4 +1,4 @@
-use crate::rules::Rules;
+use crate::{index::TypeIndex, rules::Rules};
 use rio_turtle::NTriplesParser;
 use std::{
     fs::File,
@@ -46,7 +46,15 @@ pub fn parse_ntriples(reader: impl BufRead) -> NTriplesParser<impl BufRead> {
 pub fn parse_rules(path: &Path) -> Rules {
     return match File::open(path) {
         Ok(file) => serde_yml::from_reader(file).expect("Error parsing rules file."),
-        Err(e) => panic!("Cannot open file '{:?}': '{}'.", path, e),
+        Err(e) => panic!("Cannot open rules file '{:?}': '{}'.", path, e),
+    };
+}
+
+// Parse yaml type index
+pub fn parse_index(path: &Path) -> TypeIndex {
+    return match File::open(path) {
+        Ok(file) => serde_json::from_reader(file).expect("Error parsing index file."),
+        Err(e) => panic!("Cannot open index file '{:?}': '{}'.", path, e),
     };
 }

‎src/log.rs

+1 -1

@@ -20,7 +20,7 @@ pub fn create_logger(use_stdout: bool) -> Arc<Logger> {
         .fuse();
 
     let drain = slog_async::Async::new(drain)
-        .chan_size(5_000_000)
+        .chan_size(1_000)
        .build()
        .fuse();

‎src/main.rs

+2 -2

@@ -10,7 +10,7 @@ mod rules;
 
 // Define the imports.
 use crate::{
-    index::create_type_map,
+    index::create_type_index,
     log::{create_logger, info},
     pseudo::pseudonymize_graph,
 };
@@ -87,7 +87,7 @@ fn main() {
     match cli.command {
         Subcommands::Index(args) => {
             info!(log, "Args: {:?}", args);
-            create_type_map(&args.input, &args.output)
+            create_type_index(&args.input, &args.output)
         }
         Subcommands::Pseudo(args) => {
             info!(log, "Args: {:?}", args);

‎src/pseudo.rs

+7 -26

@@ -1,13 +1,13 @@
 use rio_api::parser::TriplesParser;
 use rio_turtle::TurtleError;
 use std::{
-    collections::HashMap,
-    io::{BufRead, Write},
+    io::Write,
     path::{Path, PathBuf},
 };
 
 use crate::{
     crypto::{new_pseudonymizer, Pseudonymize},
+    index::TypeIndex,
     io,
     log::Logger,
     rdf_types::*,
@@ -19,7 +19,7 @@ use crate::{
 fn process_triple(
     triple: Triple,
     rules_config: &Rules,
-    node_to_type: &HashMap<String, String>,
+    node_to_type: &mut TypeIndex,
     out: &mut impl Write,
     hasher: &dyn Pseudonymize,
 ) {
@@ -35,24 +35,6 @@
     }
 }
 
-// Create a index mapping node -> type from an input ntriples buffer
-fn load_type_map(input: impl BufRead) -> HashMap<String, String> {
-    let mut node_to_type: HashMap<String, String> = HashMap::new();
-    let mut triples = io::parse_ntriples(input);
-
-    while !triples.is_end() {
-        let _: Result<(), TurtleError> = triples.parse_step(&mut |t| {
-            node_to_type.insert(
-                t.subject.to_string().replace(['<', '>'], ""),
-                t.object.to_string().replace(['<', '>'], ""),
-            );
-            Ok(())
-        });
-    }
-
-    return node_to_type;
-}
-
 pub fn pseudonymize_graph(
     _: &Logger,
     input: &Path,
@@ -62,11 +44,10 @@ pub fn pseudonymize_graph(
     secret_path: &Option<PathBuf>,
 ) {
     let buf_input = io::get_reader(input);
-    let buf_index = io::get_reader(index_path);
     let mut buf_output = io::get_writer(output);
 
     let rules = io::parse_rules(rules_path);
-    let node_to_type: HashMap<String, String> = load_type_map(buf_index);
+    let mut type_index = io::parse_index(index_path);
 
     let secret = secret_path.as_ref().map(io::read_bytes);
     let pseudonymizer = new_pseudonymizer(None, secret);
@@ -80,7 +61,7 @@
             process_triple(
                 t.into(),
                 &rules,
-                &node_to_type,
+                &mut type_index,
                 &mut buf_output,
                 &pseudonymizer,
             );
@@ -102,14 +83,14 @@ mod tests {
 
     #[test]
     // Test the parsing of a triple.
-    fn encrypt_nt_file() {
+    fn pseudo_nt_file() {
         let logger = log::create_logger(true);
 
         let dir = tempdir().unwrap();
         let input_path = Path::new("tests/data/test.nt");
         let rules_path = Path::new("tests/data/rules.yaml");
         let output_path = dir.path().join("output.nt");
-        let type_map_path = Path::new("tests/data/type_map.nt");
+        let type_map_path = Path::new("tests/data/type_index.json");
         let key = None;
         pseudonymize_graph(
             &logger,

‎src/rules.rs

+60 -61

@@ -2,7 +2,7 @@ use crate::rdf_types::*;
 use ::std::collections::{HashMap, HashSet};
 use serde::{Deserialize, Serialize};
 
-use crate::model::TripleMask;
+use crate::{index::TypeIndex, model::TripleMask};
 
 /// Rules for pseudonymizing nodes
 #[derive(Serialize, Deserialize, Debug, Default)]
@@ -38,11 +38,7 @@
 }
 
 /// Check all parts of the triple against rules.
-pub fn match_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
+pub fn match_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
     let mut mask =
         match_node_rules(triple, rules, type_map) | match_object_rules(triple, rules, type_map);
 
@@ -54,17 +50,13 @@
 }
 
 /// Check triple against node-pseudonymization rules.
-pub fn match_node_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
+pub fn match_node_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
     let pseudo_subject = match &triple.subject {
-        Subject::NamedNode(n) => match_type(&n.iri, rules, type_map),
+        Subject::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Subject::BlankNode(_) => false,
     };
     let pseudo_object = match &triple.object {
-        Term::NamedNode(n) => match_type(&n.iri, rules, type_map),
+        Term::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Term::BlankNode(_) => false,
         Term::Literal(_) => false,
     };
@@ -81,22 +73,24 @@
 }
 
 /// Checks triple against object-pseudonymization rules
-pub fn match_object_rules(
-    triple: &Triple,
-    rules: &Rules,
-    type_map: &HashMap<String, String>,
-) -> TripleMask {
-    if match_predicate(&triple.predicate.iri, rules) {
+pub fn match_object_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex) -> TripleMask {
+    if match_predicate(&triple.predicate.to_string(), rules) {
         return TripleMask::OBJECT;
     }
 
     let pseudo_object = match &triple.subject {
-        Subject::NamedNode(n) => {
-            match_type_predicate(&n.iri, &triple.predicate.iri, type_map, rules)
-        }
-        Subject::BlankNode(b) => {
-            match_type_predicate(&b.id, &triple.predicate.iri, type_map, rules)
-        }
+        Subject::NamedNode(n) => match_type_predicate(
+            &n.to_string(),
+            &triple.predicate.to_string(),
+            type_map,
+            rules,
+        ),
+        Subject::BlankNode(b) => match_type_predicate(
+            &b.to_string(),
+            &triple.predicate.to_string(),
+            type_map,
+            rules,
+        ),
     };
 
     if pseudo_object {
@@ -107,9 +101,9 @@
 }
 
 /// Check if the type of input instance URI is in the rules.
-fn match_type(subject: &str, rules: &Rules, type_map: &HashMap<String, String>) -> bool {
+fn match_type(subject: &str, rules: &Rules, type_map: &mut TypeIndex) -> bool {
     if let Some(v) = type_map.get(subject) {
-        rules.nodes.of_type.contains(v)
+        v.iter().any(|&i| rules.nodes.of_type.contains(i))
     } else {
         false
     }
@@ -124,19 +118,21 @@ fn match_predicate(predicate: &str, rules: &Rules) -> bool {
 fn match_type_predicate(
     subject: &str,
     predicate: &str,
-    type_map: &HashMap<String, String>,
+    type_map: &mut TypeIndex,
     rules: &Rules,
 ) -> bool {
-    let subject_type = match type_map.get(subject) {
-        None => return false,
-        Some(v) => v,
-    };
-    let preds = rules.objects.on_type_predicate.get(subject_type);
-    if preds.is_none() || !preds.unwrap().contains(predicate) {
+    let Some(instance_types) = type_map.get(subject) else {
         return false;
-    }
+    };
 
-    return true;
+    for typ in instance_types {
+        if let Some(preds) = rules.objects.on_type_predicate.get(typ) {
+            if preds.contains(predicate) {
+                return true;
+            }
+        }
+    }
+    return false;
 }
 
 #[cfg(test)]
@@ -148,20 +144,21 @@ mod tests {
     use serde_yml;
 
     // Instance used in tests
-    const NODE_IRI: &str = "Alice";
-    const PREDICATE_IRI: &str = "hasName";
+    const NODE_IRI: &str = "<Alice>";
+    const PREDICATE_IRI: &str = "<hasName>";
 
     // Helper macro to create a HashMap from pairs
     #[macro_export]
     macro_rules! index {
         () => {
-            ::std::collections::HashMap::new()
+            TypeIndex::new()
         };
 
         ($($key:expr => $value:expr),+ $(,)?) => {
-            ::std::collections::HashMap::from([
-                $((String::from($key), String::from($value))),*
-            ])
+            TypeIndex::from_iter(
+                vec![
+                    $(($key, $value)),*
+                ].into_iter())
         };
     }
 
@@ -171,13 +168,13 @@
 
     #[rstest]
     // Subject is in the rules & type index
-    #[case(index! { NODE_IRI => "Person" }, "Person", true)]
+    #[case(index! { NODE_IRI => "<Person>" }, "<Person>", true)]
     // Subject is in the type index, not in the rules
-    #[case(index! { NODE_IRI => "Person" }, "Bank", false)]
+    #[case(index! { NODE_IRI => "<Person>" }, "<Bank>", false)]
     // Subject is not in the type index
-    #[case(index! { "BankName" => "Bank" }, "Bank", false)]
+    #[case(index! { "<BankName>" => "<Bank>" }, "<Bank>", false)]
     fn type_rule(
-        #[case] index: HashMap<String, String>,
+        #[case] mut index: TypeIndex,
         #[case] rule_type: &str,
         #[case] match_expected: bool,
     ) {
@@ -189,7 +186,7 @@
            "
        ));
 
-        assert_eq!(match_type(NODE_IRI, &rules, &index), match_expected);
+        assert_eq!(match_type(NODE_IRI, &rules, &mut index), match_expected);
     }
 
     #[rstest]
@@ -210,17 +207,17 @@
 
     #[rstest]
     // Subject predicate in config
-    #[case("Person", "hasName", index! { NODE_IRI => "Person" }, true)]
+    #[case("<Person>", "<hasName>", index! { NODE_IRI => "<Person>" }, true)]
     // Subject in config, predicate not
-    #[case("Person", "hasAge", index! { NODE_IRI => "Person" }, false)]
+    #[case("<Person>", "<hasAge>", index! { NODE_IRI => "<Person>" }, false)]
     // Subject predicate not in config
-    #[case("Bob", "hasAge", index! { NODE_IRI => "Person" }, false)]
+    #[case("<Bob>", "<hasAge>", index! { NODE_IRI => "<Person>" }, false)]
     // Subject not in type index
-    #[case("Bob", "hasAge", index! { "Bob" => "Person" }, false)]
+    #[case("<Bob>", "<hasAge>", index! { "<Bob>" => "<Person>" }, false)]
     fn type_predicate_rule(
         #[case] rule_type: &str,
         #[case] rule_predicate: &str,
-        #[case] index: HashMap<String, String>,
+        #[case] mut index: TypeIndex,
         #[case] match_expected: bool,
     ) {
         let rules = parse_rules(&format!(
@@ -233,7 +230,7 @@
        ));
 
         assert_eq!(
-            match_type_predicate(NODE_IRI, PREDICATE_IRI, &index, &rules),
+            match_type_predicate(NODE_IRI, PREDICATE_IRI, &mut index, &rules),
             match_expected
        );
    }
@@ -254,21 +251,23 @@
         let rules: Rules = parse_rules(
             r#"
             nodes:
-              of_type: ["urn:Person"]
+              of_type: ["<urn:Person>"]
             objects:
-              on_predicate: ["urn:hasLastName"]
+              on_predicate: ["<urn:hasLastName>"]
               on_type_predicate:
-                "urn:Person": ["urn:hasAge"]
+                "<urn:Person>": ["<urn:hasAge>"]
             "#,
        );
-        let index = index! {
-            "urn:Alice" => "urn:Person",
-            "urn:Bob" => "urn:Person",
-            "urn:ACME" => "urn:Organization"
+        let mut index = index! {
+            "<urn:Alice>" => "<urn:Person>",
+            "<urn:Bob>" => "<urn:Person>",
+            "<urn:ACME>" => "<urn:Organization>"
         };
+        println!("{}", serde_yml::to_string(&rules).unwrap());
+        println!("{}", serde_json::to_string(&index).unwrap());
         TurtleParser::new(triple.as_ref(), None)
             .parse_all(&mut |t| {
-                let mask = match_rules(&t.into(), &rules, &index);
+                let mask = match_rules(&t.into(), &rules, &mut index);
                 assert_eq!(mask.bits(), expected_mask);
                 Ok(()) as Result<(), TurtleError>
             })
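The behavioural change worth noting in this file is in `match_type`: the old `HashMap<String, String>` held a single type per subject, while the new `TypeIndex` can return several, and the rule now matches if any of them is listed under `nodes.of_type`. A standalone sketch of that check follows; the set contents and URIs are hypothetical, not taken from the repository.

```rust
use std::collections::HashSet;

// Stand-in for rules.nodes.of_type and for what TypeIndex::get() returns.
fn matches_any_type(of_type: &HashSet<String>, instance_types: &[&str]) -> bool {
    // Same shape as the new code: v.iter().any(|&i| rules.nodes.of_type.contains(i))
    instance_types.iter().any(|t| of_type.contains(*t))
}

fn main() {
    let of_type = HashSet::from(["<urn:Person>".to_string()]);
    // A subject that is both an Employee and a Person matches through either type.
    assert!(matches_any_type(&of_type, &["<urn:Employee>", "<urn:Person>"]));
    assert!(!matches_any_type(&of_type, &["<urn:Organization>"]));
}
```

The same any-of semantics carries over to `match_type_predicate`, which now loops over all of a subject's types before giving up.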

‎tests/data/rules.yaml

+7 -7

@@ -4,18 +4,18 @@ invert: false
 # hash URIs of people and online accounts
 nodes:
   of_type:
-    - "http://xmlns.com/foaf/0.1/Person" # All nodes which are rdf:type Person
-    - "http://xmlns.com/foaf/OnlineAccount" # "" OnlineAccount
+    - "<http://xmlns.com/foaf/0.1/Person>" # All nodes which are rdf:type Person
+    - "<http://xmlns.com/foaf/OnlineAccount>" # "" OnlineAccount
 
 objects:
   # hash accesscode values for all nodes
   on_predicate:
-    - "http://schema.org/accessCode"
+    - "<http://schema.org/accessCode>"
   #on_type: # NOTE: not currently supported
   #- "http://example.org/UserAccount"
   # hash name only for instances of person and online account
   on_type_predicate:
-    "http://xmlns.com/foaf/OnlineAccount":
-      - "http://schema.org/name"
-    "http://xmlns.com/foaf/0.1/Person":
-      - "http://schema.org/name"
+    "<http://xmlns.com/foaf/OnlineAccount>":
+      - "<http://schema.org/name>"
+    "<http://xmlns.com/foaf/0.1/Person>":
+      - "<http://schema.org/name>"

‎tests/data/type_index.json

+1

@@ -0,0 +1 @@
+{"types":["<http://xmlns.com/foaf/0.1/Person>","<http://xmlns.com/foaf/OnlineAccount>","<http://xmlns.com/foaf/0.1/Organization>"],"map":{"15212815035200482759":[1],"130358124972442050":[0],"9932096721503705860":[1],"10729855068363610633":[2],"8283467020653172379":[0]}}

‎tests/data/type_map.nt

-4
This file was deleted.

‎tools/bench/benchmark.sh

+171 (new file)

@@ -0,0 +1,171 @@
+#!/usr/bin/env bash
+
+# Benchmark runtime and memory usage of tripsu
+# Compares the working directory version against a baseline branch (main by default)
+
+set -euo pipefail
+
+### Final output path
+OUTPUT="profiling.md"
+PROFILE='release'
+BUILD_ARGS=( )
+[[ "${PROFILE}" == 'release' ]] && BUILD_ARGS+=( '--release' )
+### setup binaries
+
+# baseline binary
+BASE_BRANCH='main'
+
+BASE_DIR=$(mktemp -d)
+BASE_URL="$(git config --get remote.origin.url)"
+(
+    GIT_CLONE_PROTECTION_ACTIVE=false \
+    git clone \
+        --branch "${BASE_BRANCH}" \
+        "${BASE_URL}" \
+        "${BASE_DIR}" \
+    && cd "${BASE_DIR}" \
+    && just build "${BUILD_ARGS[@]}"
+)
+BASE_BIN="${BASE_DIR}/target/${PROFILE}/tripsu"
+
+# current binary
+COMP_BRANCH="$(git rev-parse --abbrev-ref HEAD)"
+just build "${BUILD_ARGS[@]}"
+COMP_BIN="./target/${PROFILE}/tripsu"
+
+# setup data
+DATA_URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/proteomes.rdf.xz"
+INPUT="/tmp/proteomes.nt"
+
+# Download data if needed
+if [ ! -f ${INPUT} ]; then
+    curl "${DATA_URL}" \
+    | xz -dc - \
+    | rdfpipe-rs -i rdf-xml -o nt - \
+    > "${INPUT}" || rm "${INPUT}"
+fi
+
+# setup config
+RULES=$(mktemp)
+BASE_IDX=$(mktemp)
+COMP_IDX=$(mktemp)
+
+cat << EOF > "${RULES}"
+
+nodes:
+  of_type:
+  - "http://purl.uniprot.org/core/Proteome"
+  - "http://purl.uniprot.org/core/Strain"
+
+objects:
+  on_type_predicate:
+    "http://purl.uniprot.org/core/Submission_Citation":
+    - "http://purl.uniprot.org/core/author"
+
+  on_predicate:
+  - "http://purl.org/dc/terms/identifier"
+
+EOF
+
+### Commands to benchmark
+BASE_CMD_IDX="${BASE_BIN} index -o ${BASE_IDX} ${INPUT}"
+COMP_CMD_IDX="${COMP_BIN} index -o ${COMP_IDX} ${INPUT}"
+BASE_CMD_PSD="${BASE_BIN} pseudo -r ${RULES} -x ${BASE_IDX} ${INPUT}"
+COMP_CMD_PSD="${COMP_BIN} pseudo -r ${RULES} -x ${COMP_IDX} ${INPUT}"
+
+### functions for profiling
+
+cpu_prof() {
+    local branch1=$1
+    local cmd1=$2
+    local branch2=$3
+    local cmd2=$4
+    local out=$5
+    hyperfine --export-markdown "${out}" -r 5 \
+        -n "${branch1}" "${cmd1}" \
+        -n "${branch2}" "${cmd2}"
+}
+
+mem_prof() {
+    local name=$1
+    local cmd=$2
+    local heap_out
+    heap_out=$(mktemp)
+    echo -n "$name: "
+    # shellcheck disable=SC2086
+    heaptrack -o "${heap_out}" ${cmd} >/dev/null
+    heaptrack_print "${heap_out}.zst" \
+        | grep '^peak heap memory'
+}
+
+make_report() {
+    local cpu_index=$1
+    local cpu_pseudo=$2
+    local mem_index=$3
+    local mem_pseudo=$4
+    local base_branch=$5
+
+    cat <<-MD
+# tripsu profiling
+
+> date: $(date -u +%Y-%m-%d)
+
+Comparing $(git branch --show-current) against $base_branch.
+
+## Timings
+
+Run time compared using hyperfine
+
+### Indexing
+
+$(cat "${cpu_index}")
+
+### Pseudonymization
+
+$(cat "${cpu_pseudo}")
+
+## Memory
+
+Heap memory usage compared using heaptrack
+
+### Indexing
+
+$(cat "${mem_index}")
+
+### Pseudonymization
+
+$(cat "${mem_pseudo}")
+MD
+}
+
+
+### Run profiling
+
+## Profile cpu time
+HYPF_IDX_OUT=$(mktemp)
+HYPF_PSD_OUT=$(mktemp)
+
+# indexing
+cpu_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" \
+    "${COMP_BRANCH}" "${COMP_CMD_IDX}" "${HYPF_IDX_OUT}"
+# pseudonymization
+cpu_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" \
+    "${COMP_BRANCH}" "${COMP_CMD_IDX}" "${HYPF_PSD_OUT}"
+
+## Profile memory
+HEAP_IDX_OUT=$(mktemp)
+HEAP_PSD_OUT=$(mktemp)
+
+# indexing
+mem_prof "${BASE_BRANCH}" "${BASE_CMD_IDX}" > "${HEAP_IDX_OUT}"
+mem_prof "${COMP_BRANCH}" "${COMP_CMD_IDX}" >> "${HEAP_IDX_OUT}"
+# pseudonymization
+mem_prof "${BASE_BRANCH}" "${BASE_CMD_PSD}" > "${HEAP_PSD_OUT}"
+mem_prof "${COMP_BRANCH}" "${COMP_CMD_PSD}" >> "${HEAP_PSD_OUT}"
+
+
+### Reporting
+make_report \
+    "${HYPF_IDX_OUT}" "${HYPF_PSD_OUT}" \
+    "${HEAP_IDX_OUT}" "${HEAP_PSD_OUT}" \
+    "${BASE_BRANCH}" > "${OUTPUT}"

‎tools/nix/flake.nix

+11

@@ -82,6 +82,11 @@
         dasel
       ];
 
+      benchInputs = with pkgs; [
+        hyperfine
+        heaptrack
+      ];
+
       # Things needed at runtime.
       buildInputs = [];
 
@@ -98,6 +103,12 @@
       inherit buildInputs;
       nativeBuildInputs = nativeBuildInputsBasic ++ nativeBuildInputsDev;
     };
+    bench = mkShell {
+      inherit buildInputs;
+      nativeBuildInputs = nativeBuildInputsBasic
+        ++ nativeBuildInputsDev
+        ++ benchInputs;
+    };
 
     ci = mkShell {
       inherit buildInputs;
