Skip to content

Commit 89bb1f5

Browse files
committed
Add source stat model
Save source stat information. Update sources that have corrected their earliestDatestamp. Catch and log all source exceptions.
1 parent 9ca2282 commit 89bb1f5

10 files changed

Lines changed: 263 additions & 122 deletions

File tree

project/settings.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,12 @@
327327
'schedule': crontab(hour=23, minute=30),
328328
'args': (1, 'elasticsearch'),
329329
},
330+
# Executes daily at 11:30 P.M
331+
'source-stat-task': {
332+
'task': 'share.tasks.SourceStatTask',
333+
'schedule': crontab(hour=23, minute=30),
334+
'args': (),
335+
},
330336
}
331337

332338
CELERY_TASK_SERIALIZER = 'json'
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# -*- coding: utf-8 -*-
2+
# Generated by Django 1.11.1 on 2017-06-01 17:03
3+
from __future__ import unicode_literals
4+
5+
from django.db import migrations, models
6+
import django.db.models.deletion
7+
import share.models.ingest
8+
9+
10+
class Migration(migrations.Migration):
    """Create the SourceStat and SourceStatOAI models.

    SourceStatOAI is built on SourceStat through a parent-link one-to-one
    field; the ``config`` foreign key to SourceConfig is added to SourceStat
    as the last operation.
    """

    dependencies = [
        ('share', '0033_merge_20170509_1710'),
    ]

    operations = [
        migrations.CreateModel(
            name='SourceStat',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('is_deleted', models.BooleanField(default=False)),
                ('date_created', models.DateTimeField(auto_now_add=True)),
                ('response_status_code', models.SmallIntegerField(blank=True, null=True)),
                ('response_elapsed_time', models.FloatField(blank=True, null=True)),
                ('response_exception', models.TextField(blank=True, null=True)),
                ('earliest_datestamp_config', models.DateField(blank=True, null=True)),
                ('base_url_config', models.TextField()),
                ('admin_note', models.TextField(blank=True)),
            ],
            managers=[
                ('objects', share.models.ingest.NaturalKeyManager('config.label')),
            ],
        ),
        migrations.CreateModel(
            name='SourceStatOAI',
            fields=[
                (
                    'sourcestat_ptr',
                    models.OneToOneField(
                        auto_created=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        parent_link=True,
                        primary_key=True,
                        serialize=False,
                        to='share.SourceStat',
                    ),
                ),
                ('earliest_datestamp_source', models.DateField(blank=True, null=True)),
                ('earliest_datestamps_match', models.BooleanField(default=False)),
                ('base_url_source', models.TextField(blank=True, null=True)),
                ('base_urls_match', models.BooleanField(default=False)),
            ],
            bases=('share.sourcestat',),
        ),
        migrations.AddField(
            model_name='sourcestat',
            name='config',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='share.SourceConfig'),
        ),
    ]

share/models/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from share.models.registration import * # noqa
1111
from share.models.identifiers import * # noqa
1212
from share.models.relations import * # noqa
13-
from share.models.banner import * # noqa
14-
from share.models.ingest import * # noqa
15-
from share.models.logs import * # noqa
13+
from share.models.banner import * # noqa
14+
from share.models.ingest import * # noqa
15+
from share.models.logs import * # noqa
16+
from share.models.sources import * # noqa

share/models/sources.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import logging
2+
3+
from django.db import models
4+
5+
from share.models.ingest import SourceConfig
6+
from share.models.ingest import NaturalKeyManager
7+
8+
logger = logging.getLogger(__name__)
9+
__all__ = ('SourceStat', 'SourceStatOAI',)
10+
11+
12+
class SourceStat(models.Model):
    """A recorded check of one source config's endpoint.

    Each row keeps the HTTP outcome (status code, elapsed time, or the
    exception text) alongside the config values used for the check.
    """

    # The source config this stat was collected for.
    config = models.ForeignKey(SourceConfig, on_delete=models.CASCADE)
    is_deleted = models.BooleanField(default=False)
    date_created = models.DateTimeField(auto_now_add=True)

    # Outcome of the request; all three may be empty.
    response_status_code = models.SmallIntegerField(blank=True, null=True)
    response_elapsed_time = models.FloatField(blank=True, null=True)
    response_exception = models.TextField(blank=True, null=True)

    # Values taken from the source config at the time of the check.
    earliest_datestamp_config = models.DateField(blank=True, null=True)
    base_url_config = models.TextField()

    admin_note = models.TextField(blank=True)

    objects = NaturalKeyManager('config.label')

    def natural_key(self):
        """Return a one-element tuple holding the related config's label."""
        return (self.config.label,)

    def __repr__(self):
        return '<{}({}, {})>'.format(type(self).__name__, self.pk, self.config.label)

    def __str__(self):
        source_config = self.config
        return '{}: {}'.format(source_config.source.long_title, source_config.label)
33+
34+
35+
class SourceStatOAI(SourceStat):
    """SourceStat extended with the values an OAI source reports about itself."""

    # earliestDatestamp as reported by the source, and whether it equals
    # the configured value.
    earliest_datestamp_source = models.DateField(blank=True, null=True)
    earliest_datestamps_match = models.BooleanField(default=False)

    # baseURL as reported by the source, and whether it equals the
    # configured value.
    base_url_source = models.TextField(blank=True, null=True)
    base_urls_match = models.BooleanField(default=False)

share/sources/be.ghent/source.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
configs:
22
- base_url: https://biblio.ugent.be/oai
33
disabled: false
4-
earliest_date: null # earliestDatestamp is earliest published
4+
earliest_date: 2016-12-14T15:38:10Z
55
harvester: oai
66
harvester_kwargs: {metadata_prefix: mods}
77
label: be.ghent.mods

share/sources/ch.cern/source.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
configs:
22
- base_url: http://cds.cern.ch/oai2d
33
disabled: false
4-
earliest_date: null # earliestDatestamp is earliest published
4+
earliest_date: 2003-06-02T08:06:23Z
55
harvester: oai
66
harvester_kwargs: {metadata_prefix: oai_dc}
77
label: ch.cern

share/sources/edu.udc/source.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
configs:
22
- base_url: http://conservancy.umn.edu/oai/request
33
disabled: false
4-
earliest_date: null # earliestDatestamp is earliest published
4+
earliest_date: 2007-02-19T20:04:59Z
55
harvester: oai
66
harvester_kwargs: {metadata_prefix: mods}
77
label: edu.udc.mods

share/sources/es.csic/source.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
configs:
33
- base_url: http://digital.csic.es/dspace-oai/request
44
disabled: false
5-
earliest_date: null # earliestDatestamp is earliest published
5+
earliest_date: 2007-05-07T22:00:00Z
66
harvester: oai
77
harvester_kwargs: {metadata_prefix: mods}
88
label: es.csic.mods

share/tasks.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import pendulum
99
import celery
1010
import requests
11+
from furl import furl
12+
from lxml import etree
1113

1214
from django.apps import apps
1315
from django.conf import settings
@@ -21,6 +23,7 @@
2123
from share.harvest.exceptions import HarvesterConcurrencyError, HarvesterDisabledError
2224
from share.models import HarvestLog
2325
from share.models import Source
26+
from share.models import SourceStat, SourceStatOAI
2427
from share.models import RawDatum, NormalizedData, ChangeSet, CeleryTask, CeleryProviderTask, ShareUser, SourceConfig
2528

2629

@@ -407,3 +410,159 @@ def do_run(self, last_run=None, **kwargs):
407410
bot = self.config.get_bot(self.started_by, last_run=last_run, **kwargs)
408411
logger.info('Running bot %s. Started by %s', bot, self.started_by)
409412
bot.run()
413+
414+
415+
# XML namespace prefixes handed to xpath() when querying OAI responses.
NAMESPACES = dict(
    dc='http://purl.org/dc/elements/1.1/',
    ns0='http://www.openarchives.org/OAI/2.0/',
    oai_dc='http://www.openarchives.org/OAI/2.0/oai_dc/',
)
420+
421+
422+
def get_field_from_identify(response, field):
    '''Return the text of ``field`` from an OAI Identify response.

    ``response`` must expose the raw body as ``response.content``. The body
    is parsed with a recovering XML parser and the first
    ``ns0:Identify/ns0:<field>`` element is looked up.

    Returns the element's text, or ``None`` when the body cannot be parsed,
    the element is absent, or the element has no text.
    '''
    try:
        parsed = etree.fromstring(response.content, parser=etree.XMLParser(recover=True))
    except etree.XMLSyntaxError:
        # Even in recover mode some bodies (e.g. empty ones) may be rejected.
        return None

    # A recovering parser can yield no root element at all.
    if parsed is None:
        return None

    matches = parsed.xpath('//ns0:Identify/ns0:{}'.format(field), namespaces=NAMESPACES)
    return matches[0].text if matches else None
425+
426+
427+
def assert_no_exception(url, timeout=15.0):
    '''GET ``url`` and return the outcome as a ``(response, error)`` pair.

    On success the pair is ``(response, None)``. If the request raises, the
    exception is logged as a warning and the pair is ``(None, exception)``.
    '''
    try:
        resp = requests.get(url, timeout=timeout)
    except Exception as exc:
        # Deliberately broad: every failure is logged and handed back to the
        # caller instead of propagating.
        logger.warning('Exception received from source: %s', exc)
        return (None, exc)
    else:
        return (resp, None)
435+
436+
437+
def _identify_field_or_none(response, field):
    '''Return an Identify field's text, or None when it cannot be read.'''
    try:
        return get_field_from_identify(response, field)
    except (IndexError, AttributeError, etree.XMLSyntaxError):
        # Element missing or body not usable as XML: treat the value as
        # unknown so the stat row is still written.
        return None


@celery.task(bind=True)
def get_oai_source_stats(self, *args, **kwargs):
    '''Send an Identify request to an OAI source and store a SourceStatOAI row.

    Expected kwargs: ``base_url``, ``id`` (the source config's id) and
    ``earliest_date`` (a date string or a falsy value).

    The status code and elapsed time are recorded for every response that
    was received, including 4xx/5xx ones. The Identify body is only
    inspected when the response is OK. A request that raised is stored as
    the exception's text with no status code or elapsed time.

    Known incorrect baseUrl:
    'https://biblio.ugent.be/oai',  # homepage listed
    'http://purr.purdue.edu/oaipmh',  # homepage listed
    'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi',  # incorrect url listed
    'https://mla.hcommons.org/deposits/oai/',  # incorrect url listed
    'http://oai.repec.org',  # listed url redirects to this

    Known incorrect earliestDatestamp (all emailed):
    'edu.oaktrust.mods',  # returns the most recent datestamp
    'edu.scholarsarchiveosu.mods',  # returns 0011-01-01
    'edu.uwashington.mods',  # returns 2083-03-01
    'gov.nodc',  # 1996-10-09
    'org.philpapers',  # 1990-01-01T00:00:00Z
    'org.ttu.mods',  # 1989-05-01T05:00:00Z
    'edu.umich.mods',  # 1983-01-01T05:00:00Z
    'edu.citeseerx',  # 1970-01-01
    'br.pcurio',  # 1970-01-01
    'edu.vtech.mods',  # 1900-02-02T05:00:00Z
    'edu.icpsr',  # 01-01-1900
    'pt.rcaap',  # 1900-01-01T00:00:00Z
    'org.ucescholarship',  # 1885-05-01
    'com.nature',  # 1869-11-04
    '''
    base_url_config = kwargs['base_url']
    config_id = kwargs['id']
    earliest_datestamp_config = kwargs['earliest_date'] or None

    base_url_source = None
    base_urls_match = False
    earliest_datestamp_source = None
    earliest_datestamps_match = False
    response_elapsed_time = None
    response_status_code = None

    response, error = assert_no_exception(furl(base_url_config).set({'verb': 'Identify'}).url)

    # Compare against None explicitly: a requests Response is falsy for
    # 4xx/5xx, which are exactly the responses worth recording.
    if response is not None:
        response_elapsed_time = response.elapsed.total_seconds()
        response_status_code = response.status_code

    if response is not None and response.ok:
        base_url_source = _identify_field_or_none(response, 'baseURL')
        # ignores http vs https
        _, separator, source_location = (base_url_source or '').partition('://')
        if separator:
            base_urls_match = source_location == base_url_config.partition('://')[2]
        else:
            logger.warning('Source baseURL is improper: %s', base_url_source)

        earliest_datestamp_identify = _identify_field_or_none(response, 'earliestDatestamp')
        if earliest_datestamp_identify:
            try:
                earliest_datestamp_source = pendulum.parse(earliest_datestamp_identify).to_date_string()
            except (ValueError, OverflowError):
                # Some sources report datestamps that cannot be parsed;
                # keep the row and leave the source value empty.
                logger.warning('Source earliestDatestamp is improper: %s', earliest_datestamp_identify)
        earliest_datestamps_match = earliest_datestamp_config == earliest_datestamp_source

    # objects.create() already saves the row; no extra save() is needed.
    SourceStatOAI.objects.create(
        earliest_datestamp_source=earliest_datestamp_source,
        earliest_datestamp_config=earliest_datestamp_config,
        earliest_datestamps_match=earliest_datestamps_match,

        base_url_source=base_url_source,
        base_url_config=base_url_config,
        base_urls_match=base_urls_match,

        config_id=config_id,

        response_status_code=response_status_code,
        response_elapsed_time=response_elapsed_time,
        response_exception=str(error) if error is not None else None,
    )
510+
511+
512+
@celery.task(bind=True)
def get_source_stats(self, *args, **kwargs):
    '''Request a non-OAI source's base URL and store a SourceStat row.

    Expected kwargs: ``base_url``, ``id`` (the source config's id) and
    ``earliest_date`` (a date string or a falsy value).

    The status code and elapsed time are recorded for every response that
    was received, including 4xx/5xx ones. A request that raised is stored
    as the exception's text with no status code or elapsed time.
    '''
    base_url_config = kwargs['base_url']
    config_id = kwargs['id']
    earliest_datestamp_config = kwargs['earliest_date'] or None
    response_elapsed_time = None
    response_status_code = None

    response, error = assert_no_exception(base_url_config)

    # Compare against None explicitly: a requests Response is falsy for
    # 4xx/5xx, which are exactly the responses worth recording.
    if response is not None:
        response_elapsed_time = response.elapsed.total_seconds()
        response_status_code = response.status_code

    # Non-OAI sources have no Identify data, so the base SourceStat model
    # is used. objects.create() already saves the row.
    SourceStat.objects.create(
        earliest_datestamp_config=earliest_datestamp_config,
        base_url_config=base_url_config,
        config_id=config_id,
        response_status_code=response_status_code,
        response_elapsed_time=response_elapsed_time,
        response_exception=str(error) if error is not None else None,
    )
537+
538+
539+
class SourceStatusTask(celery.Task):
    '''Queue one stat-collection task per enabled source config that has a base URL.'''

    def run(self):
        # OAI and non-OAI configs are fanned out by separate sub-tasks.
        self.get_oai_stats.apply_async()
        self.get_non_oai_stats.apply_async()

    @celery.task(bind=True)
    def get_oai_stats(self):
        '''Queue get_oai_source_stats for every enabled OAI config.'''
        enabled_oai = SourceConfig.objects.filter(
            disabled=False,
            base_url__isnull=False,
            harvester__key='oai'
        )

        for row in enabled_oai.values():
            earliest = row['earliest_date']
            # The row is sent as task kwargs, so the date goes as a string.
            row['earliest_date'] = earliest.isoformat() if earliest else None
            get_oai_source_stats.apply_async(kwargs=row)

    @celery.task(bind=True)
    def get_non_oai_stats(self):
        '''Queue get_source_stats for every enabled non-OAI config.'''
        enabled_non_oai = SourceConfig.objects.filter(
            disabled=False,
            base_url__isnull=False
        ).exclude(
            harvester__key='oai'
        )

        for row in enabled_non_oai.values():
            earliest = row['earliest_date']
            # The row is sent as task kwargs, so the date goes as a string.
            row['earliest_date'] = earliest.isoformat() if earliest else None
            get_source_stats.apply_async(kwargs=row)

0 commit comments

Comments
 (0)