|
8 | 8 | import pendulum |
9 | 9 | import celery |
10 | 10 | import requests |
| 11 | +from furl import furl |
| 12 | +from lxml import etree |
11 | 13 |
|
12 | 14 | from django.apps import apps |
13 | 15 | from django.conf import settings |
|
21 | 23 | from share.harvest.exceptions import HarvesterConcurrencyError, HarvesterDisabledError |
22 | 24 | from share.models import HarvestLog |
23 | 25 | from share.models import Source |
| 26 | +from share.models import SourceStat, SourceStatOAI |
24 | 27 | from share.models import RawDatum, NormalizedData, ChangeSet, CeleryTask, CeleryProviderTask, ShareUser, SourceConfig |
25 | 28 |
|
26 | 29 |
|
@@ -407,3 +410,159 @@ def do_run(self, last_run=None, **kwargs): |
407 | 410 | bot = self.config.get_bot(self.started_by, last_run=last_run, **kwargs) |
408 | 411 | logger.info('Running bot %s. Started by %s', bot, self.started_by) |
409 | 412 | bot.run() |
| 413 | + |
| 414 | + |
# Prefix -> namespace URI mapping handed to lxml xpath() calls in this module.
# 'ns0' is the prefix used for the OAI-PMH 2.0 namespace in the xpath expressions.
NAMESPACES = dict(
    dc='http://purl.org/dc/elements/1.1/',
    ns0='http://www.openarchives.org/OAI/2.0/',
    oai_dc='http://www.openarchives.org/OAI/2.0/oai_dc/',
)
| 420 | + |
| 421 | + |
def get_field_from_identify(response, field):
    """Return the text of one child element of an OAI-PMH ``Identify`` response.

    :param response: object exposing the raw response body as ``.content``
        (the callers in this module pass a ``requests`` response).
    :param field: local name of the element to look up under
        ``ns0:Identify`` (e.g. ``'baseURL'``, ``'earliestDatestamp'``).
    :return: the element's text, or ``None`` when the body yields no usable
        document, the element is absent, or the element is empty.
    """
    # recover=True lets lxml build a tree from malformed XML instead of raising.
    parsed = etree.fromstring(response.content, parser=etree.XMLParser(recover=True))
    # NOTE(review): a recovering parse of an unusable body appears to be able
    # to produce no root element at all; guard so we do not call .xpath on None.
    if parsed is None:
        return None

    matches = parsed.xpath('//ns0:Identify/ns0:{}'.format(field), namespaces=NAMESPACES)
    # An OAI error payload (or an HTML page served at the URL) has no Identify
    # element; report "not found" instead of raising IndexError on [0].
    if not matches:
        return None
    return matches[0].text
| 425 | + |
| 426 | + |
def assert_no_exception(url, timeout=15.0):
    """GET ``url`` and report the outcome as a ``(response, error)`` pair.

    Despite the name, nothing is asserted: any exception raised by the request
    is logged as a warning and handed back to the caller instead of propagating.

    :param url: address to request.
    :param timeout: value forwarded to ``requests.get`` as ``timeout``.
    :return: ``(response, None)`` when the request completed, or
        ``(None, exception)`` when it raised.
    """
    outcome, failure = None, None
    try:
        outcome = requests.get(url, timeout=timeout)
    except Exception as exc:
        # Intentionally broad: every kind of failure is logged and returned.
        logger.warning('Exception received from source: %s', exc)
        failure = exc
    return (outcome, failure)
| 435 | + |
| 436 | + |
@celery.task(bind=True)
def get_oai_source_stats(self, *args, **kwargs):
    '''
    Issue an OAI-PMH ``Identify`` request against a source config's base URL
    and record one SourceStatOAI row describing the outcome.

    Expected kwargs (read directly; other keys are ignored):
        base_url: the configured base URL of the source.
        id: the source config id, stored as ``config_id``.
        earliest_date: the configured earliest date (string) or a falsy value.

    The row stores the HTTP status code and elapsed time of the request, any
    exception raised while requesting, and whether the ``baseURL`` and
    ``earliestDatestamp`` advertised by the source agree with the config.

    Known incorrect baseUrl:
        'https://biblio.ugent.be/oai',  # homepage listed
        'http://purr.purdue.edu/oaipmh',  # homepage listed
        'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi',  # incorrect url listed
        'https://mla.hcommons.org/deposits/oai/',  # incorrect url listed
        'http://oai.repec.org',  # listed url redirects to this

    Known incorrect earliestDatestamp (all emailed):
        'edu.oaktrust.mods',  # returns the most recent datestamp
        'edu.scholarsarchiveosu.mods',  # returns 0011-01-01
        'edu.uwashington.mods',  # returns 2083-03-01
        'gov.nodc',  # 1996-10-09
        'org.philpapers',  # 1990-01-01T00:00:00Z
        'org.ttu.mods',  # 1989-05-01T05:00:00Z
        'edu.umich.mods',  # 1983-01-01T05:00:00Z
        'edu.citeseerx',  # 1970-01-01
        'br.pcurio',  # 1970-01-01
        'edu.vtech.mods',  # 1900-02-02T05:00:00Z
        'edu.icpsr',  # 01-01-1900
        'pt.rcaap',  # 1900-01-01T00:00:00Z
        'org.ucescholarship',  # 1885-05-01
        'com.nature',  # 1869-11-04
    '''

    base_url_config = kwargs['base_url']
    config_id = kwargs['id']
    earliest_datestamp_config = kwargs['earliest_date'] or None

    # Defaults recorded when the request fails or the reply cannot be used.
    base_url_source = None
    base_urls_match = False
    earliest_datestamp_source = None
    earliest_datestamps_match = False
    response_elapsed_time = 0
    response_status_code = None

    response, response_exception = assert_no_exception(
        furl(base_url_config).set({'verb': 'Identify'}).url
    )

    # A requests Response is falsy for 4xx/5xx replies, so test against None:
    # the status code and timing of a failing source must still be recorded.
    if response is not None:
        response_elapsed_time = response.elapsed.total_seconds()
        response_status_code = response.status_code

    # Only a successful reply is parsed as an Identify document.
    if response is not None and response.ok:
        base_url_source = get_field_from_identify(response, 'baseURL')
        # An empty <baseURL/> yields None; treat it like a URL without a scheme.
        source_parts = (base_url_source or '').split('://', 1)
        if len(source_parts) > 1:
            # ignores http vs https; [-1] also tolerates a configured URL that
            # has no scheme at all instead of raising IndexError.
            base_urls_match = source_parts[1] == base_url_config.split('://', 1)[-1]
        else:
            logger.warning('Source baseURL is improper: %s', base_url_source)

        earliest_datestamp_identify = get_field_from_identify(response, 'earliestDatestamp')
        if earliest_datestamp_identify:
            # Normalise to a plain date string before comparing with the config.
            earliest_datestamp_source = pendulum.parse(earliest_datestamp_identify).to_date_string()
        earliest_datestamps_match = earliest_datestamp_config == earliest_datestamp_source

    # objects.create() already persists the row; no extra save() is needed.
    SourceStatOAI.objects.create(
        earliest_datestamp_source=earliest_datestamp_source,
        earliest_datestamp_config=earliest_datestamp_config,
        earliest_datestamps_match=earliest_datestamps_match,

        base_url_source=base_url_source,
        base_url_config=base_url_config,
        base_urls_match=base_urls_match,

        config_id=config_id,

        response_status_code=response_status_code,
        response_elapsed_time=response_elapsed_time,
        response_exception=response_exception,
    )
| 510 | + |
| 511 | + |
@celery.task(bind=True)
def get_source_stats(self, *args, **kwargs):
    '''
    Request a (non-OAI) source config's base URL and record the outcome.

    Expected kwargs (read directly; other keys are ignored):
        base_url: the configured base URL of the source.
        id: the source config id, stored as ``config_id``.
        earliest_date: the configured earliest date (string) or a falsy value.

    One stats row is created with the HTTP status code and elapsed time of
    the request (both None when the request raised) and the exception, if any.
    '''
    base_url_config = kwargs['base_url']
    config_id = kwargs['id']
    earliest_datestamp_config = kwargs['earliest_date'] or None
    response_elapsed_time = None
    response_status_code = None

    response, response_exception = assert_no_exception(base_url_config)

    # A requests Response is falsy for 4xx/5xx replies, so test against None:
    # the status code and timing of a failing source must still be recorded.
    if response is not None:
        response_elapsed_time = response.elapsed.total_seconds()
        response_status_code = response.status_code

    # NOTE(review): this writes to SourceStatOAI even though the source is not
    # an OAI source and SourceStat is imported by this module; confirm which
    # model is intended before changing it.
    # objects.create() already persists the row; no extra save() is needed.
    SourceStatOAI.objects.create(
        earliest_datestamp_config=earliest_datestamp_config,
        base_url_config=base_url_config,
        config_id=config_id,
        response_status_code=response_status_code,
        response_elapsed_time=response_elapsed_time,
        response_exception=response_exception,
    )
| 537 | + |
| 538 | + |
class SourceStatusTask(celery.Task):
    """Fan out one stats-collection task per enabled source config with a base URL.

    ``run`` queues two dispatcher tasks: one for configs whose harvester key
    is ``'oai'`` and one for all remaining configs. Each dispatcher queues a
    per-config task with the config's field values as keyword arguments.
    """

    def run(self):
        # Queue the OAI dispatcher first, then the non-OAI one.
        for dispatcher_name in ('get_oai_stats', 'get_non_oai_stats'):
            getattr(self, dispatcher_name).apply_async()

    @celery.task(bind=True)
    def get_oai_stats(self):
        """Queue ``get_oai_source_stats`` for every enabled OAI config with a base URL."""
        criteria = {
            'disabled': False,
            'base_url__isnull': False,
            'harvester__key': 'oai',
        }
        for row in SourceConfig.objects.filter(**criteria).values():
            earliest = row['earliest_date']
            # The per-config task receives the date as an ISO string (or None).
            if earliest:
                row['earliest_date'] = earliest.isoformat()
            else:
                row['earliest_date'] = None
            get_oai_source_stats.apply_async(kwargs=row)

    @celery.task(bind=True)
    def get_non_oai_stats(self):
        """Queue ``get_source_stats`` for every enabled non-OAI config with a base URL."""
        candidates = SourceConfig.objects.filter(
            disabled=False,
            base_url__isnull=False
        )
        for row in candidates.exclude(harvester__key='oai').values():
            earliest = row['earliest_date']
            # The per-config task receives the date as an ISO string (or None).
            if earliest:
                row['earliest_date'] = earliest.isoformat()
            else:
                row['earliest_date'] = None
            get_source_stats.apply_async(kwargs=row)
0 commit comments