Skip to content

Commit a8df8ef

Browse files
committed
Differentiate synonyms by priority
1 parent b8a895d commit a8df8ef

3 files changed

Lines changed: 457 additions & 11 deletions

File tree

flowmapper/match.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
rm_parentheses_roman_numerals,
1010
rm_roman_numerals_ionic_state,
1111
)
12+
from flowmapper.preferred_synonyms import (
13+
match_identical_names_in_preferred_synonyms,
14+
match_identical_names_in_synonyms,
15+
)
1216

1317
logger = logging.getLogger(__name__)
1418

@@ -26,17 +30,6 @@ def match_identical_identifier(s: Flow, t: Flow, comment: str = "Identical ident
2630
return {"comment": comment}
2731

2832

29-
def match_identical_names_in_synonyms(
30-
s: Flow, t: Flow, comment: str = "Identical synonyms"
31-
):
32-
if (
33-
(t.synonyms and s.name in t.synonyms and s.context == t.context)
34-
or (s.synonyms and t.name in s.synonyms and s.context == t.context)
35-
# and not math.isnan(s.unit.conversion_factor(t.unit)):
36-
):
37-
return {"comment": comment}
38-
39-
4033
def match_identical_cas_numbers(
4134
s: Flow, t: Flow, comment: str = "Identical CAS numbers"
4235
):
@@ -247,5 +240,6 @@ def match_rules():
247240
match_identical_cas_numbers,
248241
match_non_ionic_state,
249242
match_biogenic_to_non_fossil,
243+
match_identical_names_in_preferred_synonyms,
250244
match_identical_names_in_synonyms,
251245
]

flowmapper/preferred_synonyms.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import re
2+
3+
from flowmapper.flow import Flow
4+
5+
# Matches, at the end of a string (trailing whitespace allowed), a run of the
# letters i, v, x in any case, anchored at a word boundary, optionally wrapped
# in parentheses and optionally followed by "+" or "-".
# The run is not checked for being a well-formed roman numeral.
ROMAN_NUMERAL_PATTERN = re.compile(r"\b\(?[ivx]+[\+-]?\)?\s*$", flags=re.IGNORECASE)
6+
# Matches, at the end of a string (trailing whitespace allowed), a
# parenthesised run of the digits 1-9 with an optional trailing "+" or "-",
# e.g. "(2+)". The digit class excludes 0, so "(10+)" does not match.
PARENTHESES_PATTERN = re.compile(r"\([1-9]+[\+-]?\)\s*$")
7+
8+
9+
def has_roman_numeral_at_end(text: str) -> bool:
    """
    Report whether ``text`` finishes with a roman-numeral-like suffix.

    The decision is delegated entirely to ``ROMAN_NUMERAL_PATTERN``.

    Args:
        text (str): The string to inspect

    Returns:
        bool: True when the pattern finds a match in ``text``, False otherwise

    """
    trailing_numeral = ROMAN_NUMERAL_PATTERN.search(text)
    return trailing_numeral is not None
21+
22+
23+
def has_number_pattern_at_end(text: str) -> bool:
    """
    Report whether ``text`` finishes with a parenthesised number such as "(2+)".

    The decision is delegated entirely to ``PARENTHESES_PATTERN``.

    Args:
        text (str): The string to inspect

    Returns:
        bool: True when the pattern finds a match in ``text``, False otherwise

    """
    trailing_number = PARENTHESES_PATTERN.search(text)
    return trailing_number is not None
35+
36+
37+
def match_identical_names_in_preferred_synonyms(
    s: Flow, t: Flow, comment: str = "Identical preferred synonyms"
):
    """
    Match two flows linked by a synonym when one name extends the other.

    A match is reported when the flows share a context, one flow lists the
    other flow's name among its synonyms, the listed (plain) normalized name
    is contained in the listing flow's normalized name, and the listing
    flow's normalized name ends with a roman numeral or a pattern like "(2+)".

    Args:
        s (Flow): The source flow
        t (Flow): The target flow
        comment (str): Text stored under the "comment" key of the result

    Returns:
        dict | None: ``{"comment": comment}`` on a match, otherwise None

    """
    if not (s.context == t.context):
        return None

    # Evaluate both directions independently. The earlier if/elif form never
    # looked at the (t, s) direction once s.name was found in t.synonyms, even
    # when that first direction then failed the suffix test.
    for plain, detailed in ((s, t), (t, s)):
        if not (detailed.synonyms and plain.name in detailed.synonyms):
            continue
        detailed_name = detailed.name.normalized
        if plain.name.normalized in detailed_name and (
            has_roman_numeral_at_end(detailed_name)
            or has_number_pattern_at_end(detailed_name)
        ):
            return {"comment": comment}

    return None
52+
53+
54+
def match_identical_names_in_synonyms(
    s: Flow, t: Flow, comment: str = "Identical synonyms"
):
    """
    Match two flows when either one lists the other's name as a synonym.

    Both flows must also compare equal on ``context``.

    Args:
        s (Flow): The source flow
        t (Flow): The target flow
        comment (str): Text stored under the "comment" key of the result

    Returns:
        dict | None: ``{"comment": comment}`` on a match, otherwise None

    """

    def _lists_name_of(owner, other):
        # Same short-circuit order as a plain chained `and` expression.
        return (
            owner.synonyms
            and other.name in owner.synonyms
            and s.context == t.context
        )

    if _lists_name_of(t, s) or _lists_name_of(s, t):
        return {"comment": comment}
    return None

0 commit comments

Comments
 (0)