-
Notifications
You must be signed in to change notification settings - Fork 70
Expand file tree
/
Copy pathedu_ageconsearch.py
More file actions
117 lines (96 loc) · 3.96 KB
/
edu_ageconsearch.py
File metadata and controls
117 lines (96 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import logging

import dateutil
import dateutil.parser  # `import dateutil` alone does not load the parser submodule
from bs4 import BeautifulSoup
from furl import furl
import pendulum

from share.harvest import BaseHarvester
# BUG FIX: the original passed the literal string '__name__', creating a
# logger literally named "__name__"; the dunder must be unquoted so the
# logger is named after this module (standard logging convention).
logger = logging.getLogger(__name__)
class AgEconHarvester(BaseHarvester):
    """Harvester for AgEcon Search (http://ageconsearch.umn.edu).

    Query Parameters:
    month (MM)
    year (YYYY)
    order (oldestFirst or None)
    starts_with (YYYY-MM-DD) they don't always have a day
    top (page number)
    Returns:
    Page with nearest date
    20 records/page
    """
    VERSION = 1

    # Maps the human-readable labels found in an item's metadata display
    # table to the normalized field names stored on the harvested record.
    fields = {
        'title': 'title',
        'other titles': 'other_titles',
        'authors': 'authors',
        'editors': 'editors',
        'editors (email)': 'editors_email',
        'authors (email)': 'authors_email',
        'keywords': 'keywords',
        'jel codes': 'jel_codes',
        'issue date': 'issue_date',
        'series/report no.': 'series_report_number',
        'abstract': 'abstract',
        'uri': 'uri',
        'institution/association': 'institution_association',
        'identifiers': 'identifiers',
        'total pages': 'total_pages',
        'from page': 'from_page',
        'to page': 'to_page',
        'notes': 'notes',
        'collections:': 'collections',
    }

    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
        """Entry point: return a generator of (identifier, record) pairs
        for works whose issue date falls within [start_date, end_date].
        """
        return self.fetch_records(start_date, end_date)

    def fetch_records(self, start_date, end_date):
        """Walk the result pages and yield each in-range work.

        Requests the page nearest ``start_date`` via the ``starts_with``
        query parameter, then follows "Next page" links. Results appear
        to be ordered newest-first: records dated after ``end_date`` are
        skipped, and the walk stops entirely at the first record dated
        before ``start_date``.
        """
        logger.info('Harvesting %s - %s', start_date, end_date)
        logger.debug('Fetching page %s', self.config.base_url)
        url = furl(self.config.base_url)
        url.args['starts_with'] = start_date
        r = self.requests.get(url.url)
        r.raise_for_status()

        while True:
            document = BeautifulSoup(r.text, 'html.parser')
            # The first /handle/ link is not a work result, so skip it.
            results = document.select('a[href^="/handle/"]')[1:]
            for result in results:
                work_url = 'http://ageconsearch.umn.edu{}'.format(result.attrs['href'])
                work = self.fetch_work(work_url)
                date_status = self.check_record_date(work['issue_date'], start_date, end_date)
                if date_status == 'after':
                    # Newer than the requested window -- skip and keep scanning.
                    continue
                if date_status == 'before':
                    # Newest-first ordering means everything after this record
                    # is also before start_date, so stop the harvest entirely.
                    # (The original also set a dead within_date_range flag here.)
                    return
                yield work['primary_identifier'], work

            # BUG FIX: the original unconditionally dereferenced the
            # 'Next page' anchor and raised AttributeError on the last
            # results page, where no such link exists. End cleanly instead.
            next_link = document.find('a', string='Next page')
            if next_link is None:
                return
            r = self.requests.get('http://ageconsearch.umn.edu/{}'.format(next_link.attrs['href']))

    def check_record_date(self, issue_date, start_date, end_date):
        """Classify ``issue_date`` relative to the harvest window.

        Returns 'before', 'within', or 'after'. Issue dates sometimes omit
        the day (or more); missing components are filled from the default
        2016-01-01 via dateutil's ``default=`` mechanism.
        """
        date_object = dateutil.parser.parse(issue_date, default=pendulum.create(2016, 1, 1))
        if date_object < start_date.start_of('day'):
            return 'before'
        if date_object > end_date.end_of('day'):
            return 'after'
        return 'within'

    def fetch_work(self, url):
        """Fetch one work page and scrape its metadata table into a dict.

        The returned dict always has 'primary_identifier' (taken from the
        page's <code> element) plus one normalized key per recognized row
        label in ``self.fields``.
        """
        r = self.requests.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')

        data = {}
        data['primary_identifier'] = soup.find('code').text
        # NOTE(review): rows lacking a metadataFieldLabel cell would raise
        # AttributeError here -- presumably every row has one; verify.
        display_table = soup.find(class_='itemDisplayTable').find_all('tr')
        # Sentinel <br/> tag used to drop line-break separators from
        # multi-valued cells (hoisted out of the per-row loop).
        br_tag = BeautifulSoup('<br/>', 'lxml').br
        for row in display_table:
            label = row.find(class_='metadataFieldLabel').text.replace(':\xa0', '').lower()
            value_object = row.find(class_='metadataFieldValue')
            if value_object.string:
                # Single text node: take it directly.
                value = value_object.string
            else:
                # Mixed content: collect each child's text (or the child
                # itself), filtering out <br/> separators. Feels a little hacky.
                contents = []
                for content in value_object.contents:
                    contents.append(content.string or content)
                value = [val for val in contents if val != br_tag]
            data[self.fields[label]] = value
        return data