Skip to content

Commit 5e73bb4

Browse files
luis100 and claude committed
Phase 0: EmailArchive nested-document metadata schema (#3660)
Introduces the EmailArchive metadata type as the reference implementation for Solr nested-document support in RODA — fully config-driven, zero Java. - emailarchive.xsd: XML schema (parent mailbox + child email elements) - emailarchive.xslt: ingest crosswalk producing nested Solr child docs via <field name="emails"><doc>…</doc></field> blocks; follows rakenskapsinfo pattern - Register type in roda-wui.properties and i18n ServerMessages.properties - EmailArchiveCrosswalkTest: 12 TestNG tests (full/minimal/no-emails fixtures) covering parent fields, date fields, child count, multi-value recipients, and absent-optional-field assertions Part of: #3382 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent bf7bed0 commit 5e73bb4

9 files changed

Lines changed: 433 additions & 0 deletions

File tree

roda-core/roda-core-tests/src/main/java/org/roda/core/CorporaConstants.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ public final class CorporaConstants {
9494

9595
public static final String SOURCE_DESC_METADATA_CONTAINER = "DescriptiveMetadata";
9696
public static final String STRANGE_DESC_METADATA_FILE = "strange.xml";
97+
public static final String EMAIL_ARCHIVE_FULL_FILE = "emailarchive_full.xml";
98+
public static final String EMAIL_ARCHIVE_MINIMAL_FILE = "emailarchive_minimal.xml";
99+
public static final String EMAIL_ARCHIVE_NO_EMAILS_FILE = "emailarchive_no_emails.xml";
100+
public static final String EMAIL_ARCHIVE_METADATA_TYPE = "emailarchive";
97101

98102
public static final String TEXT_XML = "text/xml";
99103
public static final String REPRESENTATION_1_PREMIS_EVENT_ID = "urn:roda:premis:event:roda_398";
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
/**
2+
* The contents of this file are subject to the license and copyright
3+
* detailed in the LICENSE file at the root of the source
4+
* tree and available online at
5+
*
6+
* https://github.com/keeps/roda
7+
*/
8+
package org.roda.core.index;
9+
10+
import static org.testng.AssertJUnit.assertEquals;
11+
import static org.testng.AssertJUnit.assertNotNull;
12+
import static org.testng.AssertJUnit.assertNull;
13+
14+
import java.io.IOException;
15+
import java.net.URISyntaxException;
16+
import java.net.URL;
17+
import java.nio.file.Path;
18+
import java.nio.file.Paths;
19+
import java.util.Collection;
20+
21+
import org.apache.solr.common.SolrInputDocument;
22+
import org.apache.solr.common.SolrInputField;
23+
import org.roda.core.CorporaConstants;
24+
import org.roda.core.RodaCoreFactory;
25+
import org.roda.core.data.common.RodaConstants;
26+
import org.roda.core.data.exceptions.GenericException;
27+
import org.roda.core.data.exceptions.NotFoundException;
28+
import org.roda.core.data.exceptions.RODAException;
29+
import org.roda.core.index.utils.SolrUtils;
30+
import org.roda.core.storage.Binary;
31+
import org.roda.core.storage.DefaultStoragePath;
32+
import org.roda.core.storage.StorageService;
33+
import org.roda.core.storage.fs.FileStorageService;
34+
import org.testng.Assert;
35+
import org.testng.annotations.AfterMethod;
36+
import org.testng.annotations.BeforeClass;
37+
import org.testng.annotations.BeforeMethod;
38+
import org.testng.annotations.Test;
39+
40+
@Test(groups = {RodaConstants.TEST_GROUP_ALL, RodaConstants.TEST_GROUP_DEV, RodaConstants.TEST_GROUP_TRAVIS})
41+
public class EmailArchiveCrosswalkTest {
42+
43+
private static StorageService corporaService;
44+
45+
@BeforeClass
46+
public static void setUp() throws URISyntaxException, GenericException {
47+
URL corporaURL = IndexServiceTest.class.getResource("/corpora");
48+
Path corporaPath = Paths.get(corporaURL.toURI());
49+
corporaService = new FileStorageService(corporaPath);
50+
}
51+
52+
@BeforeMethod
53+
public void init() {
54+
RodaCoreFactory.instantiateTest(false, false, false, false, false, false, false);
55+
}
56+
57+
@AfterMethod
58+
public void cleanup() throws NotFoundException, GenericException, IOException {
59+
RodaCoreFactory.shutdown();
60+
}
61+
62+
// ---------------------------------------------------------------------------
63+
// Full fixture — 3 emails
64+
// ---------------------------------------------------------------------------
65+
66+
@Test
67+
public void testFullCrosswalkProducesParentFields() throws RODAException {
68+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_FULL_FILE);
69+
70+
assertNotNull(doc);
71+
assertFieldValue(doc, "custodian_txt", "João Silva");
72+
assertFieldValue(doc, "emailAddress_s", "joao.silva@empresa.pt");
73+
assertFieldValue(doc, "totalMessages_i", "3");
74+
assertFieldValue(doc, "originalFormat_s", "PST");
75+
assertFieldValue(doc, "archivingMotive_txt", "Offboarding");
76+
assertFieldValue(doc, "content_type", "emailarchive");
77+
}
78+
79+
@Test
80+
public void testFullCrosswalkProducesDateFields() throws RODAException {
81+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_FULL_FILE);
82+
83+
assertNotNull(doc);
84+
assertFieldValue(doc, "dateStart_dt", "2020-01-01T00:00:00Z");
85+
assertFieldValue(doc, "dateEnd_dt", "2023-12-31T00:00:00Z");
86+
}
87+
88+
@Test
89+
public void testFullCrosswalkProducesThreeChildDocuments() throws RODAException {
90+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_FULL_FILE);
91+
92+
assertNotNull(doc);
93+
SolrInputField emailsField = doc.getField("emails");
94+
assertNotNull("'emails' field must be present for nested children", emailsField);
95+
96+
Collection<SolrInputDocument> children = getChildDocuments(emailsField);
97+
assertEquals("Expected 3 child email documents", 3, children.size());
98+
}
99+
100+
@Test
101+
public void testFullCrosswalkFirstChildFields() throws RODAException {
102+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_FULL_FILE);
103+
SolrInputDocument first = getChildAt(doc, 0);
104+
105+
assertFieldValue(first, "content_type", "email");
106+
assertFieldValue(first, "messageId_s", "<msg001@empresa.pt>");
107+
assertFieldValue(first, "subject_txt", "Quarterly Report Q1 2021");
108+
assertFieldValue(first, "sender_s", "joao.silva@empresa.pt");
109+
assertFieldValue(first, "sentDate_dt", "2021-03-15T09:42:00Z");
110+
assertFieldValue(first, "folderPath_s", "Inbox/Projects");
111+
assertFieldValue(first, "hasAttachments_b", "true");
112+
assertFieldValue(first, "filePath_s", "Inbox/Projects/msg_001.eml");
113+
}
114+
115+
@Test
116+
public void testFullCrosswalkMultipleRecipients() throws RODAException {
117+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_FULL_FILE);
118+
SolrInputDocument first = getChildAt(doc, 0);
119+
120+
// First email has two recipients: ana.costa and rui.pinto
121+
SolrInputField recipientsField = first.getField("recipients_txt");
122+
assertNotNull("recipients_txt field must be present", recipientsField);
123+
Collection<?> values = recipientsField.getValues();
124+
assertNotNull(values);
125+
assertEquals("Expected 2 recipient values", 2, values.size());
126+
}
127+
128+
@Test
129+
public void testFullCrosswalkThirdChildFields() throws RODAException {
130+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_FULL_FILE);
131+
SolrInputDocument third = getChildAt(doc, 2);
132+
133+
assertFieldValue(third, "subject_txt", "Budget Approval Request");
134+
assertFieldValue(third, "folderPath_s", "Sent");
135+
assertFieldValue(third, "filePath_s", "Sent/msg_003.eml");
136+
}
137+
138+
// ---------------------------------------------------------------------------
139+
// Minimal fixture — 1 email, only required fields
140+
// ---------------------------------------------------------------------------
141+
142+
@Test
143+
public void testMinimalCrosswalkProducesRequiredFields() throws RODAException {
144+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_MINIMAL_FILE);
145+
146+
assertNotNull(doc);
147+
assertFieldValue(doc, "custodian_txt", "Jane Doe");
148+
assertFieldValue(doc, "emailAddress_s", "jane.doe@example.org");
149+
assertFieldValue(doc, "content_type", "emailarchive");
150+
}
151+
152+
@Test
153+
public void testMinimalCrosswalkOmitsAbsentOptionalFields() throws RODAException {
154+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_MINIMAL_FILE);
155+
156+
assertNotNull(doc);
157+
assertNull("dateStart_dt should be absent when not in source", doc.getField("dateStart_dt"));
158+
assertNull("dateEnd_dt should be absent when not in source", doc.getField("dateEnd_dt"));
159+
assertNull("totalMessages_i should be absent when not in source", doc.getField("totalMessages_i"));
160+
assertNull("originalFormat_s should be absent when not in source", doc.getField("originalFormat_s"));
161+
assertNull("archivingMotive_txt should be absent when not in source", doc.getField("archivingMotive_txt"));
162+
}
163+
164+
@Test
165+
public void testMinimalCrosswalkProducesOneChild() throws RODAException {
166+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_MINIMAL_FILE);
167+
SolrInputField emailsField = doc.getField("emails");
168+
assertNotNull(emailsField);
169+
assertEquals(1, getChildDocuments(emailsField).size());
170+
}
171+
172+
@Test
173+
public void testMinimalCrosswalkChildHasRequiredFields() throws RODAException {
174+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_MINIMAL_FILE);
175+
SolrInputDocument child = getChildAt(doc, 0);
176+
177+
assertFieldValue(child, "content_type", "email");
178+
assertFieldValue(child, "messageId_s", "<only-email@example.org>");
179+
assertFieldValue(child, "subject_txt", "Hello World");
180+
assertFieldValue(child, "hasAttachments_b", "false");
181+
}
182+
183+
// ---------------------------------------------------------------------------
184+
// No-emails fixture — mailbox with zero email records
185+
// ---------------------------------------------------------------------------
186+
187+
@Test
188+
public void testNoEmailsCrosswalkProducesParentFieldsOnly() throws RODAException {
189+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_NO_EMAILS_FILE);
190+
191+
assertNotNull(doc);
192+
assertFieldValue(doc, "custodian_txt", "Empty Mailbox User");
193+
assertFieldValue(doc, "content_type", "emailarchive");
194+
assertFieldValue(doc, "totalMessages_i", "0");
195+
}
196+
197+
@Test
198+
public void testNoEmailsCrosswalkProducesNoChildDocumentsField() throws RODAException {
199+
SolrInputDocument doc = getCrosswalkResult(CorporaConstants.EMAIL_ARCHIVE_NO_EMAILS_FILE);
200+
assertNull("'emails' field must be absent when there are no child emails", doc.getField("emails"));
201+
}
202+
203+
// ---------------------------------------------------------------------------
204+
// Helpers
205+
// ---------------------------------------------------------------------------
206+
207+
private SolrInputDocument getCrosswalkResult(String filename) throws RODAException {
208+
try {
209+
DefaultStoragePath path = DefaultStoragePath.parse(
210+
CorporaConstants.SOURCE_DESC_METADATA_CONTAINER, filename);
211+
Binary binary = corporaService.getBinary(path);
212+
return SolrUtils.getDescriptiveMetadataFields(binary, CorporaConstants.EMAIL_ARCHIVE_METADATA_TYPE, null);
213+
} catch (Exception e) {
214+
Assert.fail("Unexpected exception loading fixture '" + filename + "': " + e.getMessage());
215+
return null;
216+
}
217+
}
218+
219+
private void assertFieldValue(SolrInputDocument doc, String fieldName, String expectedValue) {
220+
SolrInputField field = doc.getField(fieldName);
221+
assertNotNull("Field '" + fieldName + "' must be present", field);
222+
assertEquals("Field '" + fieldName + "' value mismatch", expectedValue, field.getValue().toString());
223+
}
224+
225+
@SuppressWarnings("unchecked")
226+
private Collection<SolrInputDocument> getChildDocuments(SolrInputField emailsField) {
227+
Object value = emailsField.getValue();
228+
assertNotNull("'emails' field value must not be null", value);
229+
return (Collection<SolrInputDocument>) value;
230+
}
231+
232+
private SolrInputDocument getChildAt(SolrInputDocument parent, int index) {
233+
SolrInputField emailsField = parent.getField("emails");
234+
assertNotNull(emailsField);
235+
Collection<SolrInputDocument> children = getChildDocuments(emailsField);
236+
return children.stream().skip(index).findFirst()
237+
.orElseThrow(() -> new AssertionError("No child document at index " + index));
238+
}
239+
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<emailArchive xmlns="https://roda-community.org/schemas/emailarchive/v1">
3+
<custodian>João Silva</custodian>
4+
<emailAddress>joao.silva@empresa.pt</emailAddress>
5+
<dateStart>2020-01-01</dateStart>
6+
<dateEnd>2023-12-31</dateEnd>
7+
<totalMessages>3</totalMessages>
8+
<originalFormat>PST</originalFormat>
9+
<archivingMotive>Offboarding</archivingMotive>
10+
11+
<email>
12+
<messageId>&lt;msg001@empresa.pt&gt;</messageId>
13+
<subject>Quarterly Report Q1 2021</subject>
14+
<sender>joao.silva@empresa.pt</sender>
15+
<recipients>ana.costa@empresa.pt</recipients>
16+
<recipients>rui.pinto@empresa.pt</recipients>
17+
<sentDate>2021-03-15T09:42:00Z</sentDate>
18+
<folderPath>Inbox/Projects</folderPath>
19+
<hasAttachments>true</hasAttachments>
20+
<filePath>Inbox/Projects/msg_001.eml</filePath>
21+
</email>
22+
23+
<email>
24+
<messageId>&lt;msg002@empresa.pt&gt;</messageId>
25+
<subject>Re: Quarterly Report Q1 2021</subject>
26+
<sender>ana.costa@empresa.pt</sender>
27+
<recipients>joao.silva@empresa.pt</recipients>
28+
<sentDate>2021-03-15T11:05:00Z</sentDate>
29+
<folderPath>Inbox/Projects</folderPath>
30+
<hasAttachments>false</hasAttachments>
31+
<filePath>Inbox/Projects/msg_002.eml</filePath>
32+
</email>
33+
34+
<email>
35+
<messageId>&lt;msg003@empresa.pt&gt;</messageId>
36+
<subject>Budget Approval Request</subject>
37+
<sender>joao.silva@empresa.pt</sender>
38+
<recipients>board@empresa.pt</recipients>
39+
<sentDate>2021-06-01T08:00:00Z</sentDate>
40+
<folderPath>Sent</folderPath>
41+
<hasAttachments>true</hasAttachments>
42+
<filePath>Sent/msg_003.eml</filePath>
43+
</email>
44+
</emailArchive>
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<emailArchive xmlns="https://roda-community.org/schemas/emailarchive/v1">
3+
<custodian>Jane Doe</custodian>
4+
<emailAddress>jane.doe@example.org</emailAddress>
5+
6+
<email>
7+
<messageId>&lt;only-email@example.org&gt;</messageId>
8+
<subject>Hello World</subject>
9+
<sender>sender@example.org</sender>
10+
<recipients>jane.doe@example.org</recipients>
11+
<sentDate>2022-05-10T14:30:00Z</sentDate>
12+
<hasAttachments>false</hasAttachments>
13+
<filePath>Inbox/msg_0001.eml</filePath>
14+
</email>
15+
</emailArchive>
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!-- Edge case: mailbox with metadata but no email records yet indexed -->
3+
<emailArchive xmlns="https://roda-community.org/schemas/emailarchive/v1">
4+
<custodian>Empty Mailbox User</custodian>
5+
<emailAddress>empty@example.org</emailAddress>
6+
<dateStart>2023-01-01</dateStart>
7+
<dateEnd>2023-01-01</dateEnd>
8+
<totalMessages>0</totalMessages>
9+
<originalFormat>MBOX</originalFormat>
10+
<archivingMotive>Legal Hold</archivingMotive>
11+
</emailArchive>

0 commit comments

Comments
 (0)