-
Notifications
You must be signed in to change notification settings - Fork 70
Expand file tree
/
Copy pathedu_ageconsearch.py
More file actions
117 lines (96 loc) · 3.96 KB
/
edu_ageconsearch.py
File metadata and controls
117 lines (96 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import logging

import dateutil
import dateutil.parser  # `import dateutil` alone does not load the parser submodule
from bs4 import BeautifulSoup
from furl import furl
import pendulum

from share.harvest import BaseHarvester
# BUG FIX: the original passed the literal string '__name__', creating a
# logger literally named "__name__"; the dunder must be unquoted so the
# logger is named after this module (standard logging convention).
logger = logging.getLogger(__name__)
class AgEconHarvester(BaseHarvester):
    """Harvester for AgEcon Search (http://ageconsearch.umn.edu).

    Query Parameters:
    month (MM)
    year (YYYY)
    order (oldestFirst or None)
    starts_with (YYYY-MM-DD) they don't always have a day
    top (page number)
    Returns:
    Page with nearest date
    20 records/page
    """
    VERSION = 1

    # Maps the human-readable labels found in an item's metadata display
    # table to the normalized field names stored on the harvested record.
    fields = {
        'title': 'title',
        'other titles': 'other_titles',
        'authors': 'authors',
        'editors': 'editors',
        'editors (email)': 'editors_email',
        'authors (email)': 'authors_email',
        'keywords': 'keywords',
        'jel codes': 'jel_codes',
        'issue date': 'issue_date',
        'series/report no.': 'series_report_number',
        'abstract': 'abstract',
        'uri': 'uri',
        'institution/association': 'institution_association',
        'identifiers': 'identifiers',
        'total pages': 'total_pages',
        'from page': 'from_page',
        'to page': 'to_page',
        'notes': 'notes',
        'collections:': 'collections',
    }

    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
        """Entry point: return a generator of (identifier, record) pairs
        for works whose issue date falls within [start_date, end_date].
        """
        return self.fetch_records(start_date, end_date)

    def fetch_records(self, start_date, end_date):
        """Walk the result pages and yield each in-range work.

        Requests the page nearest ``start_date`` via the ``starts_with``
        query parameter, then follows "Next page" links. Results appear
        to be ordered newest-first: records dated after ``end_date`` are
        skipped, and the walk stops entirely at the first record dated
        before ``start_date``.
        """
        logger.info('Harvesting %s - %s', start_date, end_date)
        logger.debug('Fetching page %s', self.config.base_url)
        url = furl(self.config.base_url)
        url.args['starts_with'] = start_date
        r = self.requests.get(url.url)
        r.raise_for_status()

        while True:
            document = BeautifulSoup(r.text, 'html.parser')
            # The first /handle/ link is not a work result, so skip it.
            results = document.select('a[href^="/handle/"]')[1:]
            for result in results:
                work_url = 'http://ageconsearch.umn.edu{}'.format(result.attrs['href'])
                work = self.fetch_work(work_url)
                date_status = self.check_record_date(work['issue_date'], start_date, end_date)
                if date_status == 'after':
                    # Newer than the requested window -- skip and keep scanning.
                    continue
                if date_status == 'before':
                    # Newest-first ordering means everything after this record
                    # is also before start_date, so stop the harvest entirely.
                    # (The original also set a dead within_date_range flag here.)
                    return
                yield work['primary_identifier'], work

            # BUG FIX: the original unconditionally dereferenced the
            # 'Next page' anchor and raised AttributeError on the last
            # results page, where no such link exists. End cleanly instead.
            next_link = document.find('a', string='Next page')
            if next_link is None:
                return
            r = self.requests.get('http://ageconsearch.umn.edu/{}'.format(next_link.attrs['href']))

    def check_record_date(self, issue_date, start_date, end_date):
        """Classify ``issue_date`` relative to the harvest window.

        Returns 'before', 'within', or 'after'. Issue dates sometimes omit
        the day (or more); missing components are filled from the default
        2016-01-01 via dateutil's ``default=`` mechanism.
        """
        date_object = dateutil.parser.parse(issue_date, default=pendulum.create(2016, 1, 1))
        if date_object < start_date.start_of('day'):
            return 'before'
        if date_object > end_date.end_of('day'):
            return 'after'
        return 'within'

    def fetch_work(self, url):
        """Fetch one work page and scrape its metadata table into a dict.

        The returned dict always has 'primary_identifier' (taken from the
        page's <code> element) plus one normalized key per recognized row
        label in ``self.fields``.
        """
        r = self.requests.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')

        data = {}
        data['primary_identifier'] = soup.find('code').text
        # NOTE(review): rows lacking a metadataFieldLabel cell would raise
        # AttributeError here -- presumably every row has one; verify.
        display_table = soup.find(class_='itemDisplayTable').find_all('tr')
        # Sentinel <br/> tag used to drop line-break separators from
        # multi-valued cells (hoisted out of the per-row loop).
        br_tag = BeautifulSoup('<br/>', 'lxml').br
        for row in display_table:
            label = row.find(class_='metadataFieldLabel').text.replace(':\xa0', '').lower()
            value_object = row.find(class_='metadataFieldValue')
            if value_object.string:
                # Single text node: take it directly.
                value = value_object.string
            else:
                # Mixed content: collect each child's text (or the child
                # itself), filtering out <br/> separators. Feels a little hacky.
                contents = []
                for content in value_object.contents:
                    contents.append(content.string or content)
                value = [val for val in contents if val != br_tag]
            data[self.fields[label]] = value
        return data