Merge pull request #49 from MDAnalysis/docs-add-contrib

orbeckst · web-flow · commit 2de91ab68311 · 2021-07-20T07:47:14.000-07:00
doc updates: contributing datasets and generate authors list from AUTHORS
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+MDAnalysisData/authors.py
diff --git a/AUTHORS b/AUTHORS
@@ -3,13 +3,19 @@ Authors of MDAnalysisData
 
 Contributors to MDAnalysisData are listed under the year of their
 first contribution. Full names, GitHub handles (@name) or email
-addresses are all optional.
+addresses (all in parentheses) are all optional.
+
+Chronological list of authors
+-----------------------------
 
 2018
-   - Oliver Beckstein @orbeckst
-   - Shujie Fan @VOD555
-   - Micaela Matta @micaela-matta
-   - Richard J. Gowers @richardjgowers
+
+  - Oliver Beckstein (@orbeckst)
+  - Shujie Fan (@VOD555)
+  - Micaela Matta (@micaela-matta)
+  - Richard J. Gowers (@richardjgowers)
 
 2020
-   - Lily Wang @lilyminium
+
+  - Lily Wang (@lilyminium)
+  
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [0.8.1] - YYYY-MM-DD
 
+### Added
+- docs for how to contribute a new dataset (#46)
+- new `MDAnalysisData.__authors__` attribute with the list of AUTHORS
+
 ### Changes
 - update online docs theme (#43)
 
diff --git a/MDAnalysisData/__init__.py b/MDAnalysisData/__init__.py
@@ -9,10 +9,15 @@
 
 from . import datasets
 
-
-
-
-
 from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
+
+try:
+    from .authors import __authors__
+except ImportError:
+    import warnings
+    warnings.warn('Could not find authors.py, __authors__ will be the '
+                  'generic MDAnalysis team.')
+    __authors__ = ["The MDAnalysis Development Team"]
+    del warnings
diff --git a/MDAnalysisData/tests/test_package_metadata.py b/MDAnalysisData/tests/test_package_metadata.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+import sys
+from six.moves import reload_module
+import pytest
+
+import MDAnalysisData
+
+
+def test_authors():
+    authors = MDAnalysisData.__authors__
+    assert authors[:5] == ["Oliver Beckstein", "Shujie Fan",
+                           "Richard J. Gowers", "Micaela Matta",
+                           "Lily Wang"]
+
+
+def test_default_authors(monkeypatch):
+    # authors.py did not exist and could not be imported
+    monkeypatch.setitem(sys.modules, 'MDAnalysisData.authors', None)
+    with pytest.warns(UserWarning,
+                      match="Could not find authors.py, __authors__ will "
+                            "be the generic MDAnalysis team."):
+        reload_module(MDAnalysisData)
+    authors = MDAnalysisData.__authors__
+    assert authors == ["The MDAnalysis Development Team"]
+
+
+def test_version():
+    # very generic because versioneer will provide different strings depending
+    # on the repository status
+    assert isinstance(MDAnalysisData.__version__, str)
diff --git a/docs/conf.py b/docs/conf.py
@@ -12,16 +12,19 @@
 # serve to show the default.
 
 import sys, os
+import datetime
+
+import sphinx_rtd_theme
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
-
+#
 # make sure sphinx always uses the current branch
 sys.path.insert(0, os.path.abspath('..'))
 
-import sphinx_rtd_theme
+# to get authors and version
+import MDAnalysisData
 
 # -- General configuration -----------------------------------------------------
 
@@ -63,18 +66,19 @@
 master_doc = 'index'
 
 # General information about the project.
+author_list = MDAnalysisData.__authors__
+authors = u', '.join(author_list[:-1]) + u', and ' + author_list[-1]
 project = u'MDAnalysisData'
-author = (u'Oliver Beckstein, Shujie Fan, Micaela Matta, '
-          u'Richard J. Gowers, Lily Wang')
-copyright = u'2018-2020, ' + author
+now = datetime.datetime.now()
+copyright = u'2018-{}, '.format(now.year) + authors
 
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The full version, including alpha/beta/rc tags.
-release = __import__('MDAnalysisData').__version__
+release = MDAnalysisData.__version__
 # The short X.Y version.
 version = ".".join(release.split('.')[:2])
 
@@ -240,7 +244,7 @@
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
     (master_doc, 'MDAnalysisData.tex', u'MDAnalysisData Documentation',
-     author, 'manual'),
+     authors, 'manual'),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
@@ -270,7 +274,7 @@
 # (source start file, name, description, authors, manual section).
 man_pages = [
     (master_doc, 'MDAnalysisData', u'MDAnalysisData Documentation',
-     author.split(), 1)
+     authors.split(), 1)
 ]
 
 # If true, show URL addresses after external links.
@@ -284,7 +288,7 @@
 #  dir menu entry, description, category)
 texinfo_documents = [
     (master_doc, 'MDAnalysisData', u'MDAnalysisData Documentation',
-     author, 'MDAnalysisData', 'Datasets for MDAnalysis',
+     authors, 'MDAnalysisData', 'Datasets for MDAnalysis',
      'Miscellaneous'),
 ]
 
@@ -301,5 +305,5 @@
 # Configuration for intersphinx: refer to the Python standard library
 # and other packages used by MDAnalysis
 intersphinx_mapping = {'https://docs.python.org/': None,
-                       'https://www.mdanalysis.org/docs/': None,
+                       'https://docs.mdanalysis.org/stable': None,
                        }
diff --git a/docs/contributing.rst b/docs/contributing.rst
@@ -0,0 +1,196 @@
+.. -*- coding: utf-8 -*-
+.. _contributing:
+
+===========================
+ Contributing new datasets
+===========================
+
+New datasets are very welcome and everybody is encouraged to make their
+datasets accessible via :mod:`MDAnalysisData`, regardless of the simulation
+package or analysis code that they use. Users are encouraged to cite the
+authors of the datasets.
+
+:mod:`MDAnalysisData` does *not* store files and trajectories. Instead, it
+provides accessor code to seamlessly download (and cache) files from archives.
+
+
+Outline
+=======
+
+When you contribute data then you have to do two things:
+
+1. **deposit data in an archive** under an `Open Data`_ compatible license
+   (`CC0`_ or `CC-BY`_ preferred)
+2. **write accessor code** in :mod:`MDAnalysisData`
+
+   The accessor code needs the stable archive URL(s) for your files and SHA256
+   checksums to check the integrity for any downloaded files. You will also add
+   a description of your dataset.
+
+
+.. note::
+
+    We currently have code to work with the `figshare`_ archive so choosing
+    *figshare* will be easiest. But it should be straightforward to add code to
+    work with other archive-grade repositories such as `zenodo`_ or
+    `DataDryad`_. Some universities also provide digital repositories that are
+    suitable. Open an issue in the `Issue Tracker`_ for supporting other
+    archives.
+   
+
+Step-by-step instructions
+=========================
+
+To add a new dataset deposit your data in a repository. Then open a *pull
+request* for the https://github.com/MDAnalysis/MDAnalysisData
+repository. Follow these steps:
+
+STEP 1: Archival deposition
+---------------------------
+
+Deposit *all* required files in an archive-grade repository such as
+`figshare`_.
+
+.. Note::
+
+   The site must *provide stable download links* and *may not change the
+   content during download* because we store a SHA256 :ref:`checksum<checksum>`
+   to check file integrity.
+
+Make sure to **choose an** `Open Data`_ **compatible license** such as CC0_ or
+`CC-BY`_.
+
+Take note of the **direct download URL** for each of your files. It should be
+possible to obtain the file directly from a stable URL with :program:`curl` or
+:program:`wget`. As an example look at the dataset for
+:mod:`MDAnalysisData.adk_equilibrium` at DOI `10.6084/m9.figshare.5108170`_ (as
+shown in the :ref:`figure below<fig-figshare-adk>`). Especially note the
+*download* links of the DCD trajectory
+(https://ndownloader.figshare.com/files/8672074) and PSF topology files
+(https://ndownloader.figshare.com/files/8672230) as these links will be needed
+in the accessor code in :mod:`MDAnalysisData` in the next step.
+
+.. _fig-figshare-adk:
+
+.. figure:: images/figshare_adk_equilibrium.png
+
+   The AdK Equilibrium dataset on figshare DOI `10.6084/m9.figshare.5108170`_,
+   highlighting the deposited trajectory and topology files. The *download*
+   URLs are visible when hovering over a file's image.
+
+
+.. _`10.6084/m9.figshare.5108170`:
+   https://doi.org/10.6084/m9.figshare.5108170
+
+
+
+STEP 2: Add code and docs to MDAnalysisData
+-------------------------------------------
+
+
+1. Add a Python module ``{MODULE_NAME}.py`` with the name of your dataset
+   (where ``{MODULE_NAME}`` is just a placeholder). As an example see
+   `MDAnalysisData/adk_equilibrium.py`_ (which becomes
+   :mod:`MDAnalysisData.adk_equilibrium`). In many cases you can copy an
+   existing module and adapt:
+   
+   - text: describe your dataset
+   - :data:`NAME`: name of the data set; will be used as a file name so do not use spaces etc
+   - :data:`DESCRIPTION`: filename of the description file (which contains
+     restructured text format, so needs to have suffix ``.rst``)
+   - :data:`ARCHIVE`: dictionary containing
+     :class:`~MDAnalysisData.base.RemoteFileMetadata` instances. Keys should
+     describe the file type. Typically
+
+     - *topology*: topology file (PSF, TPR, ...)
+     - *trajectory*: trajectory coordinate file (DCD, XTC, ...)
+     - *structure* (optional): system with single frame of coordinates
+       (typically PDB, GRO, CRD, ...)
+	  
+   - name of the :func:`fetch_{NAME}` function (where ``{NAME}`` is a suitable
+     name to access your dataset)
+   - docs of the :func:`fetch_{NAME}` function
+   - calculate and store the reference :ref:`SHA256 checksum <checksum>` as
+     described below
+     
+2. Add a description file (example:
+   `MDAnalysisData/descr/adk_equilibrium.rst`_); copy an existing file and
+   adapt. **Make sure to add license information.**
+3. Import your :func:`fetch_{NAME}` function in
+   `MDAnalysisData/datasets.py`_. ::
+
+      from .{MODULE_NAME} import fetch_{NAME}
+     
+4. Add documentation ``{NAME}.rst`` in restructured text format under `docs/`_
+   (take existing files as examples) and append ``{NAME}`` to the second
+   ``toctree`` section of the `docs/index.rst`_ file.
+
+   .. code-block:: reST
+
+      .. toctree::
+	 :maxdepth: 1
+	 :caption: Datasets
+	 :hidden:
+
+	 adk_equilibrium
+	 adk_transitions
+	 ...
+	 CG_fiber		   
+         {NAME}
+	 
+If your data set does not follow the same pattern as the example above (where
+each file is downloaded separately) then you have to write your own
+:func:`fetch_{NAME}` function. E.g., you might download a tar file and then
+unpack the file yourself. Use scikit-learn's `sklearn/datasets`_ as examples,
+make sure that your function sets appropriate attributes in the returned
+:class:`~MDAnalysisData.base.Bunch` of records, and fully document what is
+returned.
+
+
+.. _checksum:
+
+RemoteFileMetadata and SHA256 checksum
+======================================
+
+The :class:`~MDAnalysisData.base.RemoteFileMetadata` is used by
+:func:`~MDAnalysisData.base._fetch_remote` and it will check file integrity by
+computing the SHA256 checksum of each downloaded file and comparing it to the
+stored reference. **You must compute the reference checksum and store it in your**
+:class:`~MDAnalysisData.base.RemoteFileMetadata` data structure for each file.
+
+Typically you will have a local copy of the files during testing. You can
+compute the SHA256 for a file ``FILENAME`` with the following code::
+
+  import MDAnalysisData.base
+  print(MDAnalysisData.base._sha256(FILENAME))
+
+or from the commandline
+
+.. code-block:: bash
+
+   python -c 'import MDAnalysisData; print(MDAnalysisData.base._sha256("FILENAME"))'
+
+where ``FILENAME`` is the file that is stored in the archive.
+
+
+.. references
+
+.. _`Open Data`: https://opendatacommons.org/
+.. _CC0: https://creativecommons.org/share-your-work/public-domain/cc0
+.. _CC-BY: https://creativecommons.org/licenses/by/4.0/
+.. _figshare: https://figshare.com/
+.. _zenodo: https://zenodo.org/
+.. _DataDryad: https://www.datadryad.org/
+.. _`Issue Tracker`: https://github.com/MDAnalysis/MDAnalysisData/issues
+.. _`MDAnalysisData/adk_equilibrium.py`:
+   https://github.com/MDAnalysis/MDAnalysisData/blob/master/MDAnalysisData/adk_equilibrium.py
+.. _`MDAnalysisData/descr/adk_equilibrium.rst`:
+   https://github.com/MDAnalysis/MDAnalysisData/blob/master/MDAnalysisData/descr/adk_equilibrium.rst
+.. _`MDAnalysisData/datasets.py`:
+   https://github.com/MDAnalysis/MDAnalysisData/blob/master/MDAnalysisData/datasets.py
+.. _`docs/`:
+   https://github.com/MDAnalysis/MDAnalysisData/blob/master/docs/
+.. _`docs/index.rst`:
+   https://github.com/MDAnalysis/MDAnalysisData/blob/master/docs/index.rst
+.. _`sklearn/datasets`:
+   https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/datasets
diff --git a/docs/images/figshare_adk_equilibrium.png b/docs/images/figshare_adk_equilibrium.png
diff --git a/docs/index.rst b/docs/index.rst
diff --git a/setup.py b/setup.py