Skip to content

Commit 2de91ab

Browse files
authored
Merge pull request #49 from MDAnalysis/docs-add-contrib
doc updates: contributing datasets and generate authors list from AUTHORS
2 parents 56f9ae0 + ea62356 commit 2de91ab

10 files changed

Lines changed: 360 additions & 32 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
MDAnalysisData/authors.py

AUTHORS

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,19 @@ Authors of MDAnalysisData
33

44
Contributors to MDAnalysisData are listed under the year of their
55
first contribution. Full names, GitHub handles (@name) or email
6-
addresses are all optional.
6+
addresses (all in parentheses) are all optional.
7+
8+
Chronological list of authors
9+
-----------------------------
710

811
2018
9-
- Oliver Beckstein @orbeckst
10-
- Shujie Fan @VOD555
11-
- Micaela Matta @micaela-matta
12-
- Richard J. Gowers @richardjgowers
12+
13+
- Oliver Beckstein (@orbeckst)
14+
- Shujie Fan (@VOD555)
15+
- Micaela Matta (@micaela-matta)
16+
- Richard J. Gowers (@richardjgowers)
1317

1418
2020
15-
- Lily Wang @lilyminium
19+
20+
- Lily Wang (@lilyminium)
21+

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [0.8.1] - YYYY-MM-DD
88

9+
### Added
10+
- docs for how to contribute a new dataset (#46)
11+
- new `MDAnalysisData.__authors__` attribute with the list of AUTHORS
12+
913
### Changes
1014
- update online docs theme (#43)
1115

MDAnalysisData/__init__.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,15 @@
99

1010
from . import datasets
1111

12-
13-
14-
15-
1612
from ._version import get_versions
1713
__version__ = get_versions()['version']
1814
del get_versions
15+
16+
try:
17+
from .authors import __authors__
18+
except ImportError:
19+
import warnings
20+
warnings.warn('Could not find authors.py, __authors__ will be the '
21+
'generic MDAnalysis team.')
22+
__authors__ = ["The MDAnalysis Development Team"]
23+
del warnings
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import sys
4+
from six.moves import reload_module
5+
import pytest
6+
7+
import MDAnalysisData
8+
9+
10+
def test_authors():
11+
authors = MDAnalysisData.__authors__
12+
assert authors[:5] == ["Oliver Beckstein", "Shujie Fan",
13+
"Richard J. Gowers", "Micaela Matta",
14+
"Lily Wang"]
15+
16+
17+
def test_default_authors(monkeypatch):
18+
# authors.py did not exist and could not be imported
19+
monkeypatch.setitem(sys.modules, 'MDAnalysisData.authors', None)
20+
with pytest.warns(UserWarning,
21+
match="Could not find authors.py, __authors__ will "
22+
"be the generic MDAnalysis team."):
23+
reload_module(MDAnalysisData)
24+
authors = MDAnalysisData.__authors__
25+
assert authors == ["The MDAnalysis Development Team"]
26+
27+
28+
def test_version():
29+
# very generic because versioneer will provide different strings depending
30+
# on the repository status
31+
assert isinstance(MDAnalysisData.__version__, str)

docs/conf.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,19 @@
1212
# serve to show the default.
1313

1414
import sys, os
15+
import datetime
16+
17+
import sphinx_rtd_theme
1518

1619
# If extensions (or modules to document with autodoc) are in another directory,
1720
# add these directories to sys.path here. If the directory is relative to the
1821
# documentation root, use os.path.abspath to make it absolute, like shown here.
19-
#sys.path.insert(0, os.path.abspath('.'))
20-
22+
#
2123
# make sure sphinx always uses the current branch
2224
sys.path.insert(0, os.path.abspath('..'))
2325

24-
import sphinx_rtd_theme
26+
# to get authors and version
27+
import MDAnalysisData
2528

2629
# -- General configuration -----------------------------------------------------
2730

@@ -63,18 +66,19 @@
6366
master_doc = 'index'
6467

6568
# General information about the project.
69+
author_list = MDAnalysisData.__authors__
70+
authors = u', '.join(author_list[:-1]) + u', and ' + author_list[-1]
6671
project = u'MDAnalysisData'
67-
author = (u'Oliver Beckstein, Shujie Fan, Micaela Matta, '
68-
u'Richard J. Gowers, Lily Wang')
69-
copyright = u'2018-2020, ' + author
72+
now = datetime.datetime.now()
73+
copyright = u'2018-{}, '.format(now.year) + authors
7074

7175

7276
# The version info for the project you're documenting, acts as replacement for
7377
# |version| and |release|, also used in various other places throughout the
7478
# built documents.
7579
#
7680
# The full version, including alpha/beta/rc tags.
77-
release = __import__('MDAnalysisData').__version__
81+
release = MDAnalysisData.__version__
7882
# The short X.Y version.
7983
version = ".".join(release.split('.')[:2])
8084

@@ -240,7 +244,7 @@
240244
# author, documentclass [howto, manual, or own class]).
241245
latex_documents = [
242246
(master_doc, 'MDAnalysisData.tex', u'MDAnalysisData Documentation',
243-
author, 'manual'),
247+
authors, 'manual'),
244248
]
245249

246250
# The name of an image file (relative to this directory) to place at the top of
@@ -270,7 +274,7 @@
270274
# (source start file, name, description, authors, manual section).
271275
man_pages = [
272276
(master_doc, 'MDAnalysisData', u'MDAnalysisData Documentation',
273-
author.split(), 1)
277+
authors.split(), 1)
274278
]
275279

276280
# If true, show URL addresses after external links.
@@ -284,7 +288,7 @@
284288
# dir menu entry, description, category)
285289
texinfo_documents = [
286290
(master_doc, 'MDAnalysisData', u'MDAnalysisData Documentation',
287-
author, 'MDAnalysisData', 'Datasets for MDAnalysis',
291+
authors, 'MDAnalysisData', 'Datasets for MDAnalysis',
288292
'Miscellaneous'),
289293
]
290294

@@ -301,5 +305,5 @@
301305
# Configuration for intersphinx: refer to the Python standard library
302306
# and other packages used by MDAnalysis
303307
intersphinx_mapping = {'https://docs.python.org/': None,
304-
'https://www.mdanalysis.org/docs/': None,
308+
'https://docs.mdanalysis.org/stable': None,
305309
}

docs/contributing.rst

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
.. -*- coding: utf-8 -*-
2+
.. _contributing:
3+
4+
===========================
5+
Contributing new datasets
6+
===========================
7+
8+
New datasets are very welcome and everybody is encouraged to make their
9+
datasets accessible via :mod:`MDAnalysisData`, regardless of the simulation
10+
package or analysis code that they use. Users are encouraged to cite the
11+
authors of the datasets.
12+
13+
:mod:`MDAnalysisData` does *not* store files and trajectories. Instead, it
14+
provides accessor code to seamlessly download (and cache) files from archives.
15+
16+
17+
Outline
18+
=======
19+
20+
When you contribute data then you have to do two things:
21+
22+
1. **deposit data in an archive** under an `Open Data`_ compatible license
23+
(`CC0`_ or `CC-BY`_ preferred)
24+
2. **write accessor code** in :mod:`MDAnalysisData`
25+
26+
The accessor code needs the stable archive URL(s) for your files and SHA256
27+
checksums to check the integrity for any downloaded files. You will also add
28+
a description of your dataset.
29+
30+
31+
.. note::
32+
33+
We currently have code to work with the `figshare`_ archive so choosing
34+
*figshare* will be easiest. But it should be straightforward to add code to
35+
work with other archive-grade repositories such as `zenodo`_ or
36+
`DataDryad`_. Some universities also provide digital repositories that are
37+
suitable. Open an issue in the `Issue Tracker`_ for supporting other
38+
archives.
39+
40+
41+
Step-by-step instructions
42+
=========================
43+
44+
To add a new dataset deposit your data in a repository. Then open a *pull
45+
request* for the https://github.com/MDAnalysis/MDAnalysisData
46+
repository. Follow these steps:
47+
48+
STEP 1: Archival deposition
49+
---------------------------
50+
51+
Deposit *all* required files in an archive-grade repository such as
52+
`figshare`_.
53+
54+
.. Note::
55+
56+
The site must *provide stable download links* and *may not change the
57+
content during download* because we store a SHA256 :ref:`checksum<checksum>`
58+
to check file integrity.
59+
60+
Make sure to **choose an** `Open Data`_ **compatible license** such as CC0_ or
61+
`CC-BY`_.
62+
63+
Take note of the **direct download URL** for each of your files. It should be
64+
possible to obtain the file directly from a stable URL with :program:`curl` or
65+
:program:`wget`. As an example look at the dataset for
66+
:mod:`MDAnalysisData.adk_equilibrium` at DOI `10.6084/m9.figshare.5108170`_ (as
67+
shown in the :ref:`figure below<fig-figshare-adk>`). Especially note the
68+
*download* links of the DCD trajectory
69+
(https://ndownloader.figshare.com/files/8672074) and PSF topology files
70+
(https://ndownloader.figshare.com/files/8672230) as these links will be needed
71+
in the accessor code in :mod:`MDAnalysisData` in the next step.
72+
73+
.. _fig-figshare-adk:
74+
75+
.. figure:: images/figshare_adk_equilibrium.png
76+
77+
The AdK Equilibrium dataset on figshare DOI `10.6084/m9.figshare.5108170`_,
78+
highlighting the deposited trajectory and topology files. The *download*
79+
URLs are visible when hovering over a file's image.
80+
81+
82+
.. _`10.6084/m9.figshare.5108170`:
83+
https://doi.org/10.6084/m9.figshare.5108170
84+
85+
86+
87+
STEP 2: Add code and docs to MDAnalysisData
88+
-------------------------------------------
89+
90+
91+
1. Add a Python module ``{MODULE_NAME}.py`` with the name of your dataset
92+
(where ``{MODULE_NAME}`` is just a placeholder). As an example see
93+
`MDAnalysisData/adk_equilibrium.py`_, which becomes
94+
:mod:`MDAnalysisData.adk_equilibrium`. In many cases you can copy an
95+
existing module and adapt:
96+
97+
- text: describe your dataset
98+
- :data:`NAME`: name of the data set; will be used as a file name so do not use spaces etc.
99+
- :data:`DESCRIPTION`: filename of the description file (which contains
100+
restructured text format, so needs to have suffix ``.rst``)
101+
- :data:`ARCHIVE`: dictionary containing
102+
:class:`~MDAnalysisData.base.RemoteFileMetadata` instances. Keys should
103+
describe the file type. Typically
104+
105+
- *topology*: topology file (PSF, TPR, ...)
106+
- *trajectory*: trajectory coordinate file (DCD, XTC, ...)
107+
- *structure* (optional): system with single frame of coordinates
108+
(typically PDB, GRO, CRD, ...)
109+
110+
- name of the :func:`fetch_{NAME}` function (where ``{NAME}`` is a suitable
111+
name to access your dataset)
112+
- docs of the :func:`fetch_{NAME}` function
113+
- calculate and store the reference :ref:`SHA256 checksum <checksum>` as
114+
described below
115+
116+
2. Add a description file (example:
117+
`MDAnalysisData/descr/adk_equilibrium.rst`_); copy an existing file and
118+
adapt. **Make sure to add license information.**
119+
3. Import your :func:`fetch_{NAME}` function in
120+
`MDAnalysisData/datasets.py`_. ::
121+
122+
from .{MODULE_NAME} import fetch_{NAME}
123+
124+
4. Add documentation ``{NAME}.rst`` in restructured text format under `docs/`_
125+
(take existing files as examples) and append ``{NAME}`` to the second
126+
``toctree`` section of the `docs/index.rst`_ file.
127+
128+
.. code-block:: reST
129+
130+
.. toctree::
131+
:maxdepth: 1
132+
:caption: Datasets
133+
:hidden:
134+
135+
adk_equilibrium
136+
adk_transitions
137+
...
138+
CG_fiber
139+
{NAME}
140+
141+
If your data set does not follow the same pattern as the example above (where
142+
each file is downloaded separately) then you have to write your own
143+
:func:`fetch_{NAME}` function. E.g., you might download a tar file and then
144+
unpack the file yourself. Use scikit-learn's `sklearn/datasets`_ as examples,
145+
make sure that your function sets appropriate attributes in the returned
146+
:class:`~MDAnalysisData.base.Bunch` of records, and fully document what is
147+
returned.
148+
149+
150+
.. _checksum:
151+
152+
RemoteFileMetadata and SHA256 checksum
153+
======================================
154+
155+
The :class:`~MDAnalysisData.base.RemoteFileMetadata` is used by
156+
:func:`~MDAnalysisData.base._fetch_remote` and it will check file integrity by
157+
computing a SHA256 checksum over each downloaded file and comparing it with a stored reference
158+
checksum. **You must compute the reference checksum and store it in your**
159+
:class:`~MDAnalysisData.base.RemoteFileMetadata` data structure for each file.
160+
161+
Typically you will have a local copy of the files during testing. You can
162+
compute the SHA256 for a file ``FILENAME`` with the following code::
163+
164+
import MDAnalysisData.base
165+
print(MDAnalysisData.base._sha256(FILENAME))
166+
167+
or from the commandline
168+
169+
.. code-block:: bash
170+
171+
python -c 'import MDAnalysisData; print(MDAnalysisData.base._sha256("FILENAME"))'
172+
173+
where ``FILENAME`` is the file that is stored in the archive.
174+
175+
176+
.. references
177+
178+
.. _`Open Data`: https://opendatacommons.org/
179+
.. _CC0: https://creativecommons.org/share-your-work/public-domain/cc0
180+
.. _CC-BY: https://creativecommons.org/licenses/by/4.0/
181+
.. _figshare: https://figshare.com/
182+
.. _zenodo: https://zenodo.org/
183+
.. _DataDryad: https://www.datadryad.org/
184+
.. _`Issue Tracker`: https://github.com/MDAnalysis/MDAnalysisData/issues
185+
.. _`MDAnalysisData/adk_equilibrium.py`:
186+
https://github.com/MDAnalysis/MDAnalysisData/blob/master/MDAnalysisData/adk_equilibrium.py
187+
.. _`MDAnalysisData/descr/adk_equilibrium.rst`:
188+
https://github.com/MDAnalysis/MDAnalysisData/blob/master/MDAnalysisData/descr/adk_equilibrium.rst
189+
.. _`MDAnalysisData/datasets.py`:
190+
https://github.com/MDAnalysis/MDAnalysisData/blob/master/MDAnalysisData/datasets.py
191+
.. _`docs/`:
192+
https://github.com/MDAnalysis/MDAnalysisData/blob/master/docs/
193+
.. _`docs/index.rst`:
194+
https://github.com/MDAnalysis/MDAnalysisData/blob/master/docs/index.rst
195+
.. _`sklearn/datasets`:
196+
https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/datasets
80.6 KB
Loading

0 commit comments

Comments
 (0)