11import re
2+ import sys
23import logging
34
4-
55__version__ = '0.1.1'
6-
7-
86logger = logging .getLogger ("doi" )
97
108
def pdf_to_doi(filepath, maxlines=None):
    """Try to get a DOI from a file path.

    It looks for a DOI regex in the binary data, line by line, and returns
    the first DOI found, in the hopes that this DOI is the correct one.

    :param filepath: Path to the pdf file.
    :param maxlines: Maximum number of lines that should be checked, or
        ``None`` for no limit.
        For some documents, it could spend a long time trying to look for
        a DOI, and DOIs in the middle of documents don't tend to be the
        correct DOI of the document.
    :returns: DOI or ``None``.
    """
    if maxlines is None:
        # Sentinel meaning "no limit": larger than any realistic line count.
        maxlines = sys.maxsize

    with open(filepath, 'rb') as fd:
        for j, line in enumerate(fd):
            # Check the bound *before* scanning, so exactly `maxlines`
            # lines are inspected (the old `j > maxlines` after-the-fact
            # check scanned two extra lines).
            if j >= maxlines:
                return None
            # PDF bytes are not valid ASCII as a whole; ignore the
            # undecodable bytes and search whatever text survives.
            doi = find_doi_in_text(line.decode('ascii', errors='ignore'))
            if doi:
                return doi
    return None
3532
3633
3734def validate_doi (doi ):
38- """We check that the DOI can be resolved by official means. If so, we
39- return the resolved URL, otherwise, we return None (which means the DOI is
40- invalid).
41-
42- http://www.doi.org/factsheets/DOIProxy.html
35+ """We check that the DOI can be resolved by
36+ `official means <http://www.doi.org/factsheets/DOIProxy.html>`_. If so, we
37+ return the resolved URL, otherwise, we return ``None`` (which means the
38+ DOI is invalid).
4339
44- :param doi: Doi identificator
45- :type doi: str
46- :returns: It returns the url assigned to the doi if everything went right
47- :rtype: str
48-
49- :raises ValueError: Whenever the doi is not valid
40+ :param doi: Identifier.
41+ :returns: The URL assigned to the DOI or ``None``.
5042 """
5143 from urllib .error import HTTPError , URLError
5244 import urllib .request
5345 import urllib .parse
5446 import json
5547 url = "https://doi.org/api/handles/{doi}" .format (doi = doi )
56- logger .debug ('handle url %s' % url )
48+ logger .debug ('handle url %s' , url )
5749 request = urllib .request .Request (url )
5850
5951 try :
6052 result = json .loads (urllib .request .urlopen (request ).read ().decode ())
6153 if 'values' in result :
62- url = [v ['data' ]['value' ]
54+ urls = [v ['data' ]['value' ]
6355 for v in result ['values' ] if v .get ('type' ) == 'URL' ]
64- return url [0 ] if url else None
56+ return urls [0 ] if urls else None
6557 except HTTPError :
6658 raise ValueError ('HTTP 404: DOI not found' )
6759 except URLError as e :
@@ -80,11 +72,11 @@ def validate_doi(doi):
8072
8173
8274def get_clean_doi (doi ):
83- """Check if doi is actually a url and in that case just get
84- the exact doi .
75+ """Check if the DOI is actually a URL and in that case just get
76+ the exact DOI .
8577
86- :doi: String containing a doi
87- :returns: The pure doi
78+ :param doi: String containing a DOI.
79+ :returns: The extracted DOI.
8880 """
8981 doi = re .sub (r'%2F' , '/' , doi )
9082 # For pdfs
@@ -96,8 +88,10 @@ def get_clean_doi(doi):
9688
9789
9890def find_doi_in_text (text ):
99- """
100- Try to find a doi in a text
91+ """Try to find a DOI in a text.
92+
93+ :param text: Text in which to look for DOI.
94+ :returns: A DOI, if found, otherwise ``None``.
10195 """
10296 text = get_clean_doi (text )
10397 forbidden_doi_characters = r'"\s%$^\'<>@,;:#?&'
@@ -126,11 +120,16 @@ def find_doi_in_text(text):
126120
127121
128122def get_real_url_from_doi (doi ):
123+ """Get a URL corresponding to a DOI.
124+
125+ :param doi: Identifier.
126+ :returns: A URL for the DOI. If the DOI is invalid, return ``None``.
127+ """
129128 url = validate_doi (doi )
130- if not url :
129+ if url is None :
131130 return url
132131
133- m = re .match ('.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*' , url , re .I )
132+ m = re .match (r '.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*' , url , re .I )
134133 if m :
135134 return ('https://www.sciencedirect.com/science/article/abs/pii/{pii}'
136135 .format (pii = m .group (1 )))
0 commit comments