Skip to content
This repository was archived by the owner on Oct 1, 2024. It is now read-only.

Commit 92583aa

Browse files
committed
Moved xml parsing to dds_glossary/xml.py
1 parent 5fb5821 commit 92583aa

2 files changed

Lines changed: 130 additions & 63 deletions

File tree

dds_glossary/model.py

Lines changed: 31 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Model classes for the dds_glossary package."""
22

33
from abc import abstractmethod
4-
from collections import defaultdict
54
from typing import ClassVar
65

76
from pydantic import BaseModel
@@ -10,6 +9,13 @@
109
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
1110

1211
from .enums import MemberType, SemanticRelationType
12+
from .xml import (
13+
get_element_attribute,
14+
get_sub_element_as_str,
15+
get_sub_element_attributes,
16+
get_sub_elements_as_dict,
17+
get_sub_elements_as_dict_of_lists,
18+
)
1319

1420

1521
class Dataset(BaseModel):
@@ -43,28 +49,10 @@ class Base(DeclarativeBase):
4349
dict[str, str]: JSONB,
4450
dict[str, list[str]]: JSONB,
4551
}
46-
xml_namespace: ClassVar[str] = "{http://www.w3.org/XML/1998/namespace}"
4752

4853
def __eq__(self, other: object) -> bool:
    """
    Compare two instances by their dictionary representation.

    Args:
        other (object): The object to compare against.

    Returns:
        bool: True if both ``to_dict()`` results are equal, else False.
            ``NotImplemented`` is returned when ``other`` exposes no callable
            ``to_dict``, so Python can fall back to its default comparison
            instead of raising ``AttributeError``.
    """
    other_to_dict = getattr(other, "to_dict", None)
    if not callable(other_to_dict):
        # Unrelated object (None, str, int, ...): let Python try the reflected
        # comparison and finally fall back to identity.
        return NotImplemented
    return self.to_dict() == other_to_dict()
5055

51-
@staticmethod
52-
def get_sub_element_text(element, tag: str, default_value: str = "") -> str:
53-
"""
54-
Get a sub element text from the XML element if tag exists, else return
55-
default_value.
56-
57-
Args:
58-
element (ElementBase): The XML element to parse.
59-
tag (str): The tag to search for.
60-
default_value (str): The default value to return if the tag does not exist.
61-
62-
Returns:
63-
str: The sub element text if the tag exists, else the default value.
64-
"""
65-
sub_element = element.find(tag, namespaces=element.nsmap)
66-
return sub_element.text if sub_element is not None else default_value
67-
6856
@staticmethod
6957
def get_in_language(attribute: dict, lang: str = "en") -> str:
7058
"""
@@ -158,13 +146,10 @@ def from_xml_element(cls, element) -> "ConceptScheme":
158146
ConceptScheme: The parsed ConceptScheme instance.
159147
"""
160148
return ConceptScheme(
161-
iri=element.get(f"{{{element.nsmap['rdf']}}}about"),
162-
notation=cls.get_sub_element_text(element, "core:notation"),
163-
scopeNote=cls.get_sub_element_text(element, "core:scopeNote"),
164-
prefLabels={
165-
label.get(f"{cls.xml_namespace}lang"): label.text
166-
for label in element.findall("core:prefLabel", namespaces=element.nsmap)
167-
},
149+
iri=get_element_attribute(element, "about"),
150+
notation=get_sub_element_as_str(element, "core:notation"),
151+
scopeNote=get_sub_element_as_str(element, "core:scopeNote"),
152+
prefLabels=get_sub_elements_as_dict(element, "core:prefLabel"),
168153
)
169154

170155
def to_dict(self, lang: str = "en") -> dict:
@@ -242,12 +227,7 @@ def get_concept_schemes(
242227
Returns:
243228
list[ConceptScheme]: The concept schemes to which the member belongs.
244229
"""
245-
scheme_iris = [
246-
scheme_element.get(f"{{{element.nsmap['rdf']}}}resource")
247-
for scheme_element in element.findall(
248-
"core:inScheme", namespaces=element.nsmap
249-
)
250-
]
230+
scheme_iris = get_sub_element_attributes(element, "core:inScheme", "resource")
251231
return [
252232
concept_scheme
253233
for concept_scheme in concept_schemes
@@ -316,17 +296,11 @@ def from_xml_element(
316296
Collection: The parsed Collection instance.
317297
"""
318298
return Collection(
319-
iri=element.get(f"{{{element.nsmap['rdf']}}}about"),
320-
notation=cls.get_sub_element_text(element, "core:notation"),
321-
prefLabels={
322-
label.get(f"{cls.xml_namespace}lang"): label.text
323-
for label in element.findall("core:prefLabel", namespaces=element.nsmap)
324-
},
299+
iri=get_element_attribute(element, "about"),
300+
notation=get_sub_element_as_str(element, "core:notation"),
301+
prefLabels=get_sub_elements_as_dict(element, "core:prefLabel"),
325302
concept_schemes=cls.get_concept_schemes(element, concept_schemes),
326-
member_iris=[
327-
member.get(f"{{{element.nsmap['rdf']}}}resource")
328-
for member in element.findall("core:member", namespaces=element.nsmap)
329-
],
303+
member_iris=get_sub_element_attributes(element, "core:member", "resource"),
330304
)
331305

332306
def resolve_members_from_xml(self, members: list[Member]) -> None:
@@ -399,22 +373,13 @@ def from_xml_element(
399373
Returns:
400374
Concept: The parsed Concept instance.
401375
"""
402-
alt_labels = defaultdict(list)
403-
for label in element.findall("core:altLabel", namespaces=element.nsmap):
404-
alt_labels[label.get(f"{cls.xml_namespace}lang")].append(label.text)
405376
return Concept(
406-
iri=element.get(f"{{{element.nsmap['rdf']}}}about"),
407-
identifier=cls.get_sub_element_text(element, "x_1.1:identifier"),
408-
notation=cls.get_sub_element_text(element, "core:notation"),
409-
prefLabels={
410-
label.get(f"{cls.xml_namespace}lang"): label.text
411-
for label in element.findall("core:prefLabel", namespaces=element.nsmap)
412-
},
413-
altLabels=alt_labels,
414-
scopeNotes={
415-
note.get(f"{cls.xml_namespace}lang"): note.text
416-
for note in element.findall("core:scopeNote", namespaces=element.nsmap)
417-
},
377+
iri=get_element_attribute(element, "about"),
378+
identifier=get_sub_element_as_str(element, "x_1.1:identifier"),
379+
notation=get_sub_element_as_str(element, "core:notation"),
380+
prefLabels=get_sub_elements_as_dict(element, "core:prefLabel"),
381+
altLabels=get_sub_elements_as_dict_of_lists(element, "core:altLabel"),
382+
scopeNotes=get_sub_elements_as_dict(element, "core:scopeNote"),
418383
concept_schemes=cls.get_concept_schemes(element, concept_schemes),
419384
)
420385

@@ -518,16 +483,19 @@ def from_xml_element(cls, element) -> list["SemanticRelation"]:
518483
Returns:
519484
list[SemanticRelation]: The parsed list of SemanticRelation instances.
520485
"""
486+
relations: dict[SemanticRelationType, list[str]] = {}
487+
for relation_type in SemanticRelationType:
488+
relations[relation_type] = get_sub_element_attributes(
489+
element, f"core:{relation_type.value}", "resource"
490+
)
521491
return [
522492
SemanticRelation(
523493
type=relation_type,
524-
source_concept_iri=element.get(f"{{{element.nsmap['rdf']}}}about"),
525-
target_concept_iri=relation.get(f"{{{element.nsmap['rdf']}}}resource"),
526-
)
527-
for relation_type in SemanticRelationType
528-
for relation in element.findall(
529-
f"core:{relation_type.value}", namespaces=element.nsmap
494+
source_concept_iri=get_element_attribute(element, "about"),
495+
target_concept_iri=target_concept_iri,
530496
)
497+
for relation_type, target_concept_iris in relations.items()
498+
for target_concept_iri in target_concept_iris
531499
]
532500

533501
def to_dict(self) -> dict:

dds_glossary/xml.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
"""XML utilities for the dds_glossary package."""
2+
3+
from collections import defaultdict
4+
from typing import Final
5+
6+
XML_NAMESPACE: Final[str] = "{http://www.w3.org/XML/1998/namespace}"
7+
8+
9+
def get_element_attribute(element, attribute: str, default_value: str = "") -> str:
    """
    Get an attribute from the XML element if it exists, else return default_value.

    The attribute name is qualified with the namespace URI that ``element.nsmap``
    binds to the ``rdf`` prefix before it is looked up on the element.

    Args:
        element (ElementBase): The XML element to parse.
        attribute (str): The local name of the attribute to search for.
        default_value (str): The default value to return if the attribute does not
            exist.

    Returns:
        str: The attribute value if it exists, else the default value.
    """
    rdf_namespace = element.nsmap.get("rdf")
    if rdf_namespace is None:
        # Without an ``rdf`` prefix the qualified attribute cannot be built, so
        # treat it as "attribute does not exist" rather than raising KeyError.
        return default_value
    value = element.get(f"{{{rdf_namespace}}}{attribute}")
    return value if value is not None else default_value
24+
25+
26+
def get_sub_element_attributes(
    element,
    tag: str,
    attribute: str,
    default_value: str = "",
) -> list[str]:
    """
    Collect one attribute value from every sub element found for the tag.

    Args:
        element (ElementBase): The XML element whose sub elements are searched.
        tag (str): The tag of the sub elements to look up.
        attribute (str): The attribute to read from each sub element.
        default_value (str): The value handed to get_element_attribute as its
            default for each sub element.

    Returns:
        list[str]: One entry per sub element, in the order findall yields them.
    """
    matches = element.findall(tag, namespaces=element.nsmap)
    values: list[str] = []
    for match in matches:
        values.append(get_element_attribute(match, attribute, default_value))
    return values
47+
48+
49+
def get_sub_element_as_str(element, tag: str, default_value: str = "") -> str:
    """
    Get a sub element text from the XML element if tag exists and has text, else
    return default_value.

    Args:
        element (ElementBase): The XML element to parse.
        tag (str): The tag to search for.
        default_value (str): The default value to return if the tag does not exist
            or the sub element carries no text.

    Returns:
        str: The sub element text if available, else the default value.
    """
    sub_element = element.find(tag, namespaces=element.nsmap)
    # An element that exists but is empty has ``text is None``; fall back to the
    # default so the declared ``str`` return type always holds.
    if sub_element is None or sub_element.text is None:
        return default_value
    return sub_element.text
64+
65+
66+
def get_sub_elements_as_dict(element, tag: str) -> dict[str, str]:
    """
    Map the language attribute of each matching sub element to its text.

    Args:
        element (ElementBase): The XML element whose sub elements are searched.
        tag (str): The tag of the sub elements to look up.

    Returns:
        dict: The sub element texts keyed by the value of their XML-namespace
            ``lang`` attribute. When several sub elements share a key, the
            last one found wins.
    """
    lang_attribute = f"{XML_NAMESPACE}lang"
    texts_by_lang = {}
    for child in element.findall(tag, namespaces=element.nsmap):
        lang = child.get(lang_attribute)
        texts_by_lang[lang] = child.text
    return texts_by_lang
81+
82+
83+
def get_sub_elements_as_dict_of_lists(element, tag: str) -> dict[str, list[str]]:
    """
    Group the texts of the matching sub elements by their language attribute.

    Args:
        element (ElementBase): The XML element whose sub elements are searched.
        tag (str): The tag of the sub elements to look up.

    Returns:
        dict: A ``defaultdict(list)`` keyed by the value of the XML-namespace
            ``lang`` attribute; each list holds the texts in the order findall
            yields the sub elements.
    """
    grouped = defaultdict(list)
    lang_attribute = f"{XML_NAMESPACE}lang"
    for child in element.findall(tag, namespaces=element.nsmap):
        lang = child.get(lang_attribute)
        grouped[lang].append(child.text)
    return grouped

0 commit comments

Comments
 (0)