Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge 00X-controlfields (#142) #150

Merged
merged 1 commit into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
487 changes: 251 additions & 236 deletions package-lock.json

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"url": "[email protected]:natlibfi/melinda-marc-record-merge-reducers-js.git"
},
"license": "LGPL-3.0+",
"version": "2.0.25",
"version": "2.1.0-alpha.1",
"main": "./dist/index.js",
"engines": {
"node": ">=18"
Expand All @@ -39,10 +39,10 @@
"@natlibfi/marc-record": "^8.1.0",
"@natlibfi/marc-record-merge": "^7.0.2",
"@natlibfi/marc-record-validate": "^8.0.6",
"@natlibfi/marc-record-validators-melinda": "^10.15.6",
"@natlibfi/marc-record-validators-melinda": "^10.16.0",
"@natlibfi/melinda-commons": "^13.0.12",
"debug": "^4.3.4",
"isbn3": "^1.1.44",
"isbn3": "^1.1.45",
"normalize-diacritics": "^4.0.3"
},
"devDependencies": {
Expand All @@ -59,7 +59,7 @@
"chai": "^4.4.1",
"cross-env": "^7.0.3",
"eslint": "^8.56.0",
"mocha": "^10.2.0",
"mocha": "^10.3.0",
"nodemon": "^3.0.3",
"nyc": "^15.1.0"
},
Expand Down
115 changes: 54 additions & 61 deletions src/reducers/controlFieldUtils.js
Original file line number Diff line number Diff line change
@@ -1,31 +1,13 @@
import {nvdebug} from './utils';
import createDebugLogger from 'debug';
//import {nvdebug} from './utils';
//import createDebugLogger from 'debug';

const debug = createDebugLogger('@natlibfi/melinda-marc-record-merge-reducers:controlFieldUtils');
//const debug = createDebugLogger('@natlibfi/melinda-marc-record-merge-reducers:controlFieldUtils');
//const debugData = debug.extend('data');
const debugDev = debug.extend('dev');

/**
 * Tell whether a single control field character position carries real data.
 *
 * @param {string} val - Character taken from a control field value.
 * @returns {boolean} false for the placeholder values '', '|', ' ' and '#'; true for anything else.
 */
function fieldPositionValueContainsInformation(val) {
  const placeholderValues = ['', '|', ' ', '#'];
  return !placeholderValues.includes(val);
}

/**
 * Pick the more informative of two characters found at the same control field position.
 * The first value wins unless it is a placeholder and the second one is not.
 *
 * @param {string} c1 - Character from the first (preferred) field.
 * @param {string} c2 - Character from the second field.
 * @returns {string} c2 only when c1 carries no information and c2 does; otherwise c1.
 */
function getBetterControlFieldPositionValue(c1, c2) {
  const secondIsBetter = !fieldPositionValueContainsInformation(c1) && fieldPositionValueContainsInformation(c2);
  return secondIsBetter ? c2 : c1;
}

//const debugDev = debug.extend('dev');

const f007Lengths = {a: 8, c: 14, d: 6, f: 10, g: 9, h: 13, k: 6, m: 23, o: 2, q: 2, r: 11, s: 14, t: 2, v: 9, z: 2};

function hasLegalLength(field) {
export function hasLegalLength(field) {
if (field.tag === '006') {
return field.value.length === 18;
}
Expand All @@ -34,62 +16,73 @@ function hasLegalLength(field) {
if (field.tag === '007') {
const c0 = field.value.charAt(0);
if (c0 in f007Lengths) {
nvdebug(`${c0}: COMPARE ${f007Lengths[c0]} vs ${field.value.length}`, debugDev);
//nvdebug(`${c0}: COMPARE ${f007Lengths[c0]} vs ${field.value.length}`, debugDev);
return field.value.length === f007Lengths[c0];
}

return false;
return false; // Sanity check. It's ok that no test reaches this point.
}

if (field.tag === '008') {
return field.value.length === 40;
}

return false;
return false; // Again: a sanity check. No test should reach this point.
}

export function isFillableControlFieldPair(baseField, sourceField) {
if (baseField.value.length !== sourceField.value.length) {
return false;
}
if (!hasLegalLength(baseField)) {
return false;
}

if (baseField.tag === '006' && baseField.value[0] !== sourceField.value[0]) {
return false;
export function genericControlFieldCharPosFix(baseField, sourceField, baseTypeOfMaterial, sourceTypeOfMaterial, rule) { // eslint-disable-line max-params
// Initially written for field 008, but may be applied to 006 and 007 as well (I guess).
// We apply some rules (eg. for government publication) even if baseTypeOfMaterial !== sourceTypeOfMaterial
if (!rule.types.includes(baseTypeOfMaterial) || !rule.types.includes(sourceTypeOfMaterial) || rule.validateOnly) {
return;
}
//console.info(`Apply ${'description' in rule ? rule.description : 'nameless'} rule`); // eslint-disable-line no-console
const legalValues = rule.prioritizedValues;
const position = baseField.tag === '006' ? rule.startPosition - 17 : rule.startPosition; // Field 006 uses rules written for field 008. 006/01=008/18 etc.
const valueForUnknown = 'valueForUnknown' in rule ? rule.valueForUnknown : undefined;
const [noAttemptToCode] = rule.noAttemptToCode;

if (baseField.tag === '007') {
// 007/00 values must be equal:
if (baseField.value.charAt(0) !== sourceField.value.charAt(0)) {
return false;
}

// 007/01 values must match or contain '|' (undefined):
if (baseField.value.charAt(1) === sourceField.value.charAt(1) || sourceField.value.charAt(1) === '|' || baseField.value.charAt(1) === '|') {
return true;
}
}
const len = legalValues.length > 0 ? legalValues[0].length : noAttemptToCode.length;

const arr1 = baseField.value.split('');
const arr2 = sourceField.value.split('');
if (arr1.every((c, i) => c === arr2[i] || !fieldPositionValueContainsInformation(c) || !fieldPositionValueContainsInformation(arr2[i]))) {
return true;
}
return false;
}
const baseValue = baseField.value.substring(position, position + len);
const sourceValue = sourceField.value.substring(position, position + len);

export function fillControlFieldGaps(baseField, sourceField, min = 0, max = 39) {
// NB! Mergability must be checked before calling this!
//console.info(`${position}: '${baseValue}' vs '${sourceValue}', UNKNOWN: '${valueForUnknown}', type of material: ${typeOfMaterial}`); // eslint-disable-line no-console
//console.info(`Consider ${'description' in rule ? rule.description : 'unnamed'} rule at ${rule.startPosition}:\n'${fieldToString(baseField)}' +\n'${fieldToString(sourceField)}' =`); // eslint-disable-line no-console

if (baseField.value.length !== sourceField.value.length) {
if (applyFix()) {
//console.info(`Apply ${'description' in rule ? rule.description : 'unnamed'} rule at ${rule.startPosition}:\n'${fieldToString(baseField)}' +\n'${fieldToString(sourceField)}' =`); // eslint-disable-line no-console
baseField.value = `${baseField.value.substring(0, position)}${sourceValue}${baseField.value.substring(position + len)}`; // eslint-disable-line functional/immutable-data
//console.info(`'${fieldToString(baseField)}'`); // eslint-disable-line no-console
return;
}
const arr1 = baseField.value.split('');
const arr2 = sourceField.value.split('');
return;

const mergedCharArray = arr1.map((c, i) => i < min || i > max ? c : getBetterControlFieldPositionValue(c, arr2[i]));
function applyFix() {
if (baseValue === sourceValue || legalValues.includes(baseValue)) {
return false;
}
if (legalValues.includes(sourceValue)) {
return true;
}
if (valueForUnknown) {
if (baseValue === valueForUnknown) {
return false;
}
if (sourceValue === valueForUnknown) {
return true;
}
}
if (noAttemptToCode) {
if (baseValue === noAttemptToCode) {
return false;
}
if (sourceValue === noAttemptToCode) {
return true;
}
}
//console.info(`DEFAULT:don't apply fix for ${baseValue} vs ${sourceValue}`); // eslint-disable-line no-console
return false;
}

baseField.value = mergedCharArray.join(''); // eslint-disable-line functional/immutable-data
}
4 changes: 4 additions & 0 deletions src/reducers/controlSubfields.js
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@
return sourceSubfield.value === subfieldValue;
}

function keepOrDropPreventsMerge() {

Check warning on line 163 in src/reducers/controlSubfields.js

View workflow job for this annotation

GitHub Actions / Node version matrix (16.x)

Function 'keepOrDropPreventsMerge' has too many statements (22). Maximum allowed is 20

Check warning on line 163 in src/reducers/controlSubfields.js

View workflow job for this annotation

GitHub Actions / Node version matrix (18.x)

Function 'keepOrDropPreventsMerge' has too many statements (22). Maximum allowed is 20

Check warning on line 163 in src/reducers/controlSubfields.js

View workflow job for this annotation

GitHub Actions / Node version matrix (20.x)

Function 'keepOrDropPreventsMerge' has too many statements (22). Maximum allowed is 20
const keepOrDrop1 = baseFieldSubfields9.filter(sf => subfieldHasKeepOrDrop(sf));
const keepOrDrop2 = sourceFieldSubfields9.filter(sf => subfieldHasKeepOrDrop(sf));

Expand All @@ -168,6 +168,10 @@
return false;
}

if (baseField.tag.charAt(0) === '1' && !keepOrDrop2.some(sf => (/<DROP>/u).test(sf.value))) {
return false;
}

const sf9lessField1 = baseField.subfields.filter(subfield => retainSubfieldForKeepComparison(subfield));
const sf9lessField2 = sourceField.subfields.filter(subfield => retainSubfieldForKeepComparison(subfield));

Expand Down
162 changes: 152 additions & 10 deletions src/reducers/field006.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import createDebugLogger from 'debug';
import {MarcRecord} from '@natlibfi/marc-record';
import {copyFields, nvdebug} from './utils.js';
import {fillControlFieldGaps, isFillableControlFieldPair} from './controlFieldUtils.js';

import {genericControlFieldCharPosFix, hasLegalLength} from './controlFieldUtils.js';
import {getSingleCharacterPositionRules, isSpecificLiteraryForm, setFormOfItem, setLiteraryForm} from './field008.js';
// Test 02: If Leader 000/06 is 'o' or 'p' in source, copy 006 from source to base as new field (2x)
// Test 03: If Leader 000/06 is something else, do nothing

Expand All @@ -21,12 +21,10 @@ export default () => (base, source) => {
const baseFields = baseRecord.get(/^006$/u);
const sourceFields = sourceRecord.get(/^006$/u);

// If both sides have same number of entries,
// and they apparently are in the same order,
// let's try to fill the gaps:
// If both sides have same number of entries, and they apparently are in the same order, let's try to fill the gaps:
if (baseFields.length > 0 && baseFields.length === sourceFields.length) {
if (baseFields.every((baseField, i) => isFillableControlFieldPair(baseField, sourceFields[i]))) { // eslint-disable-line functional/no-conditional-statements
baseFields.forEach((baseField, i) => fillControlFieldGaps(baseField, sourceFields[i]));
if (baseFields.every((baseField, i) => areMergable006Pair(baseField, sourceFields[i]))) { // eslint-disable-line functional/no-conditional-statements
baseFields.forEach((baseField, i) => fillField006Gaps(baseField, sourceFields[i]));
}
return {base: baseRecord, source};
}
Expand All @@ -38,8 +36,152 @@ export default () => (base, source) => {
return {base: baseRecord, source};
}

// Defy specs: don't copy non-identical fields. Typically we should have only one 007 field.
// And don't merge them either, as it is too risky. Let's just trust base record.
// Defy specs: don't copy non-identical fields. Typically (but not always) we should have only one 006 field.
// Default behaviour: merging is too risky (might describe different materials), so let's just trust base record.
return {base: baseRecord, source};

};

// Rule set shared with field008.js, fetched once when this module is loaded.
// Rule start positions are expressed as 008 offsets; the 006 code in this file subtracts 17 when matching them to 006 positions.
const singleCharacterPositionRules = getSingleCharacterPositionRules();

/**
 * Improve the base 006 field in place using data from the source 006 field.
 *
 * When the base has an illegal length and the source a legal one, the source value
 * replaces the base value outright. Otherwise every single character position rule
 * is applied, followed by the form of item and literary form handlers.
 * NB! Mergability must be checked before calling this.
 *
 * @param {{value: string}} baseField - Base 006 field; its value may be overwritten.
 * @param {{value: string}} sourceField - Source 006 field; only read here.
 * @returns {void}
 */
function fillField006Gaps(baseField, sourceField) {
  const baseIsMalformed = !hasLegalLength(baseField);
  if (baseIsMalformed && hasLegalLength(sourceField)) {
    // A broken base can't be patched position by position, so take the source as is.
    baseField.value = sourceField.value; // eslint-disable-line functional/immutable-data
    return;
  }

  // Both fields are treated as the same type of material, derived from the base field.
  const materialType = mapFieldToTypeOfMaterial(baseField);

  singleCharacterPositionRules.forEach(rule => {
    mergeTwo006Fields(baseField, sourceField, materialType, rule);
  });
  setFormOfItem(baseField, sourceField, materialType, materialType);
  setLiteraryForm(baseField, sourceField, materialType, materialType);
}

/**
 * Apply one character position rule to a base/source pair of 006 fields.
 * Delegates to genericControlFieldCharPosFix, passing the same type of material for both fields.
 *
 * @param {object} baseField - Base 006 field (handed on to the generic fixer).
 * @param {object} sourceField - Source 006 field.
 * @param {string} typeOfMaterial - Type of material used for both base and source.
 * @param {object} rule - Character position rule to apply.
 * @returns {void}
 */
function mergeTwo006Fields(baseField, sourceField, typeOfMaterial, rule) {
  const baseTypeOfMaterial = typeOfMaterial;
  const sourceTypeOfMaterial = typeOfMaterial;
  genericControlFieldCharPosFix(baseField, sourceField, baseTypeOfMaterial, sourceTypeOfMaterial, rule);
}

/**
 * Decide whether a base/source pair of 006 fields can be merged.
 *
 * The pair is accepted when 006/00 matches, the source has a legal length, 006/00 maps to
 * a type of material, and either the base has an illegal length (the source is then used)
 * or every character position is compatible. A position is compatible when the characters
 * are equal, when either side carries no information, or when a listed exception applies.
 *
 * By default, we try to merge 008/18-34. However we are much stricter with 006 pairs, as
 * we can not be sure they mean the same thing (there is always one 008, but 006 has 0...n
 * instances). Thus this does not allow any subsetting etc. of, say, BK 006/07-10.
 *
 * @param {{value: string}} field1 - Base 006 field.
 * @param {{value: string}} field2 - Source 006 field.
 * @returns {boolean} true if the pair may be merged.
 */
function areMergable006Pair(field1, field2) {
  // NB! We explicitly assume that only tag=006 stuff gets this far!
  // Check 006/00, and require a well-formed source:
  if (field1.value[0] !== field2.value[0] || !hasLegalLength(field2)) {
    return false;
  }
  const typeOfMaterial = mapFieldToTypeOfMaterial(field1);
  if (!typeOfMaterial) { // Must map to some type of material
    return false;
  }

  if (!hasLegalLength(field1)) {
    return true; // If base has illegal size, use source...
  }

  // From here on the base is known to have a legal length.
  if (field1.value.length !== field2.value.length) {
    return false;
  }

  const baseChars = field1.value.split('');
  const sourceChars = field2.value.split('');

  return baseChars.every((baseChar, i) => {
    const sourceChar = sourceChars[i];
    return baseChar === sourceChar ||
      !field006PositionValueContainsInformation(baseChar, i) ||
      !field006PositionValueContainsInformation(sourceChar, i) ||
      isException(baseChar, sourceChar, i);
  });

  /**
   * Differences that are tolerated even though both characters carry information.
   * (NB! A character at a given position means the same for both fields, as base 006/00 equals source 006/00.)
   */
  function isException(c1, c2, characterPosition) {
    if (characterPosition === 6 && ['BK', 'CR', 'MU', 'MX'].includes(typeOfMaterial)) {
      // Form of item: 'o' (online) and 'q' (direct electronic) are specific kinds of 's' (electronic)
      if (['o', 'q'].includes(c1) && c2 === 's') {
        return true;
      }
      if (['o', 'q'].includes(c2) && c1 === 's') {
        return true;
      }
    }

    if (characterPosition === 16 && typeOfMaterial === 'BK') {
      // '1' on one side is compatible with a specific literary form on the other side
      if (c1 === '1' && isSpecificLiteraryForm(c2)) {
        return true;
      }
      if (c2 === '1' && isSpecificLiteraryForm(c1)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tell whether character c at the given 006 position carries information for this type of material.
   */
  function field006PositionValueContainsInformation(c, position) {
    if (c === '|') {
      return false;
    }

    if (c === ' ') { // Typically false, but there are some notable exceptions:
      return spaceContainsInformation(position);
    }

    // Rules use 008 positions, and 006/01 = 008/18, hence the offset of 17.
    const relevantRules = singleCharacterPositionRules.filter(rule => rule.types.includes(typeOfMaterial) && rule.startPosition - 17 === position);
    if (relevantRules.length === 0) { // Not interested
      return false;
    }
    // The rule's "unknown" value carries no information:
    if (relevantRules.some(rule => 'valueForUnknown' in rule && rule.valueForUnknown === c)) {
      return false;
    }

    return true;
  }

  /**
   * Positions where a blank is a real value for this type of material.
   * All/some of these should be checked via rules...
   */
  function spaceContainsInformation(position) {
    if (position === 1 && typeOfMaterial === 'CR') { // 008/18 frequency
      return true;
    }
    if (position === 4 && typeOfMaterial === 'CR') { // 008/21 type of continuing resource
      return true;
    }
    // NOTE(review): map 006/05-06 corresponds to 008/22-23; the earlier comment called this "form of original item" - confirm the intent.
    if ([5, 6].includes(position) && typeOfMaterial === 'MP') {
      return true;
    }
    if (position === 6 && ['BK', 'CR', 'MU', 'MX'].includes(typeOfMaterial)) { // 008/23 form of item: '#' means "none of the following"
      return true;
    }
    if (position === 7 && typeOfMaterial === 'CR') { // 008/24 nature of entire work
      return true;
    }
    if (position === 11 && ['BK', 'CF', 'CR', 'MP', 'VM'].includes(typeOfMaterial)) { // 008/28 government publication
      return true;
    }
    if (position === 12 && ['MP', 'VM'].includes(typeOfMaterial)) { // 008/29 form of item: '#' means "none of the following"
      return true;
    }
    if (position === 13 && ['MU'].includes(typeOfMaterial)) { // 008/30 literary text for sound recordings (code 1) (008/31 code is fine/meaningless, if 008/30 is a-z...)
      return true;
    }
    if (position === 17 && typeOfMaterial === 'BK') { // 008/34 biography
      return true;
    }
    return false;
  }
}

// Lookup table: 006/00 code => type of material abbreviation.
const map06CharPos00ToTypeOfMaterial = {
  a: 'BK',
  c: 'MU',
  d: 'MU',
  e: 'MP',
  f: 'MP',
  g: 'VM',
  i: 'MU',
  j: 'MU',
  k: 'VM',
  m: 'CF',
  o: 'VM',
  p: 'MX',
  r: 'VM',
  s: 'CR',
  t: 'BK'
};

/**
 * Map a 006 field to its type of material, based on the first character of the field value.
 *
 * @param {{value: string}} field - Field whose value starts with the 006/00 code.
 * @returns {string|undefined} Type of material abbreviation, or undefined if the code is not in the table.
 */
function mapFieldToTypeOfMaterial(field) {
  const formOfMaterialCode = field.value.charAt(0);
  // A plain lookup yields undefined for codes missing from the table.
  return map06CharPos00ToTypeOfMaterial[formOfMaterialCode];
}
Loading