Skip to content

Commit 32f0e5e

Browse files
committed
clean up and update docs.
* make DOI uppercase always.
* add docs to `get_real_url_from_doi`.
* remove types (will add back with typing).
* add API to generated docs.
1 parent e61c84f commit 32f0e5e

5 files changed

Lines changed: 48 additions & 42 deletions

File tree

docs/api.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
API
2+
===
3+
4+
.. automodule:: doi
5+
:members:

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
'sphinx.ext.todo',
4141
'dollarmath',
4242
'sphinx.ext.inheritance_diagram',
43+
'sphinx_autodoc_typehints',
4344
]
4445
if os.getenv('SPELLCHECK'):
4546
extensions += 'sphinxcontrib.spelling',

docs/index.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ Welcome to python-doi's documentation!
88
readme
99
contributing
1010
authors
11-
API
12-
===
11+
api
1312

1413
.. toctree::
1514
:maxdepth: 1

setup.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ def get_version(filename):
2222

2323
dev_requirements = [
2424
'coverage', 'pytest', 'pytest-cov==2.5.0', 'twine', 'pep8',
25-
'flake8', 'wheel', 'sphinx', 'sphinx-autobuild', 'sphinx_rtd_theme']
25+
'flake8', 'wheel',
26+
'sphinx', 'sphinx-autobuild', 'sphinx-autodoc-typehints', 'sphinx_rtd_theme']
2627

2728
version = get_version('./src/doi/__init__.py')
2829

@@ -34,9 +35,10 @@ def get_version(filename):
3435
'Intended Audience :: Developers',
3536
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
3637
'Natural Language :: English',
37-
'Programming Language :: Python :: 3.4',
3838
'Programming Language :: Python :: 3.5',
3939
'Programming Language :: Python :: 3.6',
40+
'Programming Language :: Python :: 3.7',
41+
'Programming Language :: Python :: 3.8',
4042
],
4143
description="Python package to work with Document Object Identifier (doi)",
4244
install_requires=requirements,

src/doi/__init__.py

Lines changed: 37 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,67 +1,59 @@
11
import re
2+
import sys
23
import logging
34

4-
55
__version__ = '0.1.1'
6-
7-
86
logger = logging.getLogger("doi")
97

108

11-
def pdf_to_doi(filepath, maxlines=float('inf')):
12-
"""Try to get doi from a filepath, it looks for a regex in the binary
13-
data and returns the first doi found, in the hopes that this doi
9+
def pdf_to_doi(filepath, maxlines=None):
10+
"""Try to get DOI from a filepath. It looks for a regex in the binary
11+
data and returns the first DOI found, in the hopes that this DOI
1412
is the correct one.
1513
16-
:param filepath: Path to the pdf file
17-
:type filepath: str
14+
:param filepath: Path to the pdf file.
1815
:param maxlines: Maximum number of lines that should be checked
19-
For some documnets, it would spend a long time trying to look for
20-
a doi, and dois in the middle of documents don't tend to be the correct
21-
doi of the document.
22-
:type maxlines: int
23-
:returns: DOI or None
24-
:rtype: str or None
16+
For some documents, it could spend a long time trying to look for
17+
a DOI, and DOIs in the middle of documents don't tend to be the correct
18+
DOI of the document.
19+
:returns: DOI or ``None``.
2520
"""
21+
if maxlines is None:
22+
maxlines = sys.maxsize
23+
2624
with open(filepath, 'rb') as fd:
2725
for j, line in enumerate(fd):
2826
doi = find_doi_in_text(line.decode('ascii', errors='ignore'))
2927
if doi:
3028
return doi
3129
if j > maxlines:
3230
return None
33-
else:
34-
return None
31+
return None
3532

3633

3734
def validate_doi(doi):
38-
"""We check that the DOI can be resolved by official means. If so, we
39-
return the resolved URL, otherwise, we return None (which means the DOI is
40-
invalid).
41-
42-
http://www.doi.org/factsheets/DOIProxy.html
35+
"""We check that the DOI can be resolved by
36+
`official means <http://www.doi.org/factsheets/DOIProxy.html>`_. If so, we
37+
return the resolved URL, otherwise, we return ``None`` (which means the
38+
DOI is invalid).
4339
44-
:param doi: Doi identificator
45-
:type doi: str
46-
:returns: It returns the url assigned to the doi if everything went right
47-
:rtype: str
48-
49-
:raises ValueError: Whenever the doi is not valid
40+
:param doi: Identifier.
41+
:returns: The URL assigned to the DOI or ``None``.
5042
"""
5143
from urllib.error import HTTPError, URLError
5244
import urllib.request
5345
import urllib.parse
5446
import json
5547
url = "https://doi.org/api/handles/{doi}".format(doi=doi)
56-
logger.debug('handle url %s' % url)
48+
logger.debug('handle url %s', url)
5749
request = urllib.request.Request(url)
5850

5951
try:
6052
result = json.loads(urllib.request.urlopen(request).read().decode())
6153
if 'values' in result:
62-
url = [v['data']['value']
54+
urls = [v['data']['value']
6355
for v in result['values'] if v.get('type') == 'URL']
64-
return url[0] if url else None
56+
return urls[0] if urls else None
6557
except HTTPError:
6658
raise ValueError('HTTP 404: DOI not found')
6759
except URLError as e:
@@ -80,11 +72,11 @@ def validate_doi(doi):
8072

8173

8274
def get_clean_doi(doi):
83-
"""Check if doi is actually a url and in that case just get
84-
the exact doi.
75+
"""Check if the DOI is actually a URL and in that case just get
76+
the exact DOI.
8577
86-
:doi: String containing a doi
87-
:returns: The pure doi
78+
:param doi: String containing a DOI.
79+
:returns: The extracted DOI.
8880
"""
8981
doi = re.sub(r'%2F', '/', doi)
9082
# For pdfs
@@ -96,8 +88,10 @@ def get_clean_doi(doi):
9688

9789

9890
def find_doi_in_text(text):
99-
"""
100-
Try to find a doi in a text
91+
"""Try to find a DOI in a text.
92+
93+
:param text: Text in which to look for DOI.
94+
:returns: A DOI, if found, otherwise ``None``.
10195
"""
10296
text = get_clean_doi(text)
10397
forbidden_doi_characters = r'"\s%$^\'<>@,;:#?&'
@@ -126,11 +120,16 @@ def find_doi_in_text(text):
126120

127121

128122
def get_real_url_from_doi(doi):
123+
"""Get a URL corresponding to a DOI.
124+
125+
:param doi: Identifier.
126+
:returns: A URL for the DOI. If the DOI is invalid, return ``None``.
127+
"""
129128
url = validate_doi(doi)
130-
if not url:
129+
if url is None:
131130
return url
132131

133-
m = re.match('.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*', url, re.I)
132+
m = re.match(r'.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*', url, re.I)
134133
if m:
135134
return ('https://www.sciencedirect.com/science/article/abs/pii/{pii}'
136135
.format(pii=m.group(1)))

0 commit comments

Comments
 (0)