Object Counts Over Time

Show the number of objects accessible through DataONE over time.

This process uses the Solr index to count the objects of each format type (metadata, data, resource) that are available.

[1]:
import json
import pprint
from datetime import datetime

import dateutil
import dateutil.relativedelta
import requests

# strftime pattern used to render datetimes inside the dateUploaded range
# queries; note the literal "Z" suffix, so naive datetimes are sent as UTC.
SOLR_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Captured once when this cell runs (naive UTC datetime). Serves as the upper
# bound of the monthly sampling loop and as the default end of open-ended ranges.
T_NOW = datetime.utcnow()
# First sample date of the monthly time series.
T_START = datetime(2012,7,1)
[2]:
# Record the interpreter version used to produce the outputs in this notebook.
import sys
print(sys.version)
3.7.0 (default, Jun 28 2018, 07:39:16)
[Clang 4.0.1 (tags/RELEASE_401/final)]
[3]:
def getObjectCounts(t_start=None, t_end=None):
    """Count non-obsoleted objects in the DataONE Solr index by format type.

    Issues a zero-row Solr query faceted on ``formatType``, optionally
    restricted to objects whose ``dateUploaded`` lies in a time window.
    Datetimes are rendered with ``SOLR_TIME_FORMAT`` (which appends a literal
    "Z"), so naive datetimes are sent as UTC.

    Args:
        t_start: datetime lower bound for ``dateUploaded``; None for no lower
            bound.
        t_end: datetime upper bound for ``dateUploaded``; None for no upper
            bound. When ``t_start`` is given and ``t_end`` is None the range
            ends at ``T_NOW``.

    Returns:
        dict mapping each lower-cased ``formatType`` reported by the facet to
        its count. The keys "metadata", "data" and "resource" are always
        present and default to 0.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    results = {
        "metadata": 0,
        "data": 0,
        "resource": 0,
    }
    url = "https://cn.dataone.org/cn/v2/query/solr/"
    params = {
        "q": "-obsoletedBy:[* TO *]",  # only objects that have not been superseded
        "rows": "0",                    # facet counts only, no documents
        "wt": "json",
        "indent": "on",
        "facet": "on",
        "facet.field": "formatType",
        "facet.mincount": 1,
    }

    date_query = None
    if t_start is not None:
        range_start = f"{t_start:{SOLR_TIME_FORMAT}}"
        # An open-ended range closes at T_NOW. It must be rendered with the
        # Solr time format as well: the default str() of a datetime is not a
        # valid Solr date.
        range_end_dt = T_NOW if t_end is None else t_end
        range_end = f"{range_end_dt:{SOLR_TIME_FORMAT}}"
        date_query = f'dateUploaded:["{range_start}" TO "{range_end}"]'
    elif t_end is not None:
        range_end = f"{t_end:{SOLR_TIME_FORMAT}}"
        date_query = f'dateUploaded:[* TO "{range_end}"]'
    if date_query is not None:
        params["q"] = params["q"] + " AND " + date_query

    response = requests.get(url, params=params)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    payload = response.json()

    # Solr returns facet values as a flat list: [name, count, name, count, ...].
    facet_pairs = payload["facet_counts"]["facet_fields"]["formatType"]
    for format_type, count in zip(facet_pairs[::2], facet_pairs[1::2]):
        results[format_type.lower()] = count
    return results


[4]:
# No date bounds: the query carries no dateUploaded clause, so this returns
# the counts across all upload dates.
getObjectCounts()
[4]:
{'metadata': 804196, 'data': 1172369, 'resource': 198233}
[5]:

# Sample the index once per month from T_START up to T_NOW, recording the
# number of objects uploaded on or before each sample date.
c_date = T_START
object_count = []
columns = ["date", "metadata", "data", "resource"]
print(",".join(columns))
while c_date < T_NOW:
    counts = getObjectCounts(t_end=c_date)
    entry = (c_date, counts["metadata"], counts["data"], counts["resource"])
    # Echo each row as CSV so progress is visible while the requests run.
    print(f"{entry[0]:%Y-%m-%d},{entry[1]},{entry[2]},{entry[3]}")
    object_count.append(entry)
    # relativedelta steps by calendar month, keeping the day-of-month fixed.
    c_date = c_date + dateutil.relativedelta.relativedelta(months=+1)
date,metadata,data,resource
2012-07-01,53935,37983,33271
2012-08-01,56013,38337,33636
2012-09-01,57153,38734,33935
2012-10-01,58170,39212,34219
2012-11-01,59935,39850,34667
2012-12-01,61498,40576,35250
2013-01-01,63165,40849,36256
2013-02-01,64988,41559,37858
2013-03-01,67136,42468,38513
2013-04-01,78131,43175,48342
2013-05-01,79531,43720,48726
2013-06-01,81426,44486,49083
2013-07-01,82835,45198,49456
2013-08-01,84211,45827,49855
2013-09-01,85626,46494,50189
2013-10-01,101524,91431,65308
2013-11-01,102953,93025,65718
2013-12-01,104262,106207,65993
2014-01-01,105829,107285,66378
2014-02-01,107460,108266,66858
2014-03-01,109182,109013,67379
2014-04-01,110895,110098,68106
2014-05-01,112443,110996,68415
2014-06-01,113896,111644,68789
2014-07-01,115695,112643,69204
2014-08-01,117569,113481,69751
2014-09-01,119238,114091,70542
2014-10-01,121160,147466,71030
2014-11-01,139784,225202,88338
2014-12-01,141679,502607,88847
2015-01-01,144159,508124,89710
2015-02-01,146250,558404,90352
2015-03-01,170810,620899,110496
2015-04-01,174242,649892,111793
2015-05-01,177645,660905,113413
2015-06-01,185625,720123,119226
2015-07-01,192192,790147,123704
2015-08-01,200084,836458,128511
2015-09-01,208453,849627,134790
2015-10-01,213515,855994,138041
2015-11-01,216298,857573,138842
2015-12-01,220646,864802,141069
2016-01-01,223302,868777,141986
2016-02-01,226123,870414,142989
2016-03-01,231065,873424,144912
2016-04-01,260480,878181,147397
2016-05-01,267149,880579,151778
2016-06-01,272268,882501,153953
2016-07-01,277060,885499,156556
2016-08-01,282571,887481,158336
2016-09-01,290742,891994,162117
2016-10-01,294872,893848,163689
2016-11-01,307446,900447,165220
2016-12-01,312058,915351,167228
2017-01-01,318793,927216,169008
2017-02-01,322898,937484,170277
2017-03-01,326970,947631,171784
2017-04-01,334848,953745,173059
2017-05-01,342194,956625,173989
2017-06-01,349928,960240,175346
2017-07-01,354602,971766,176519
2017-08-01,359828,1057937,177739
2017-09-01,581688,1088075,180015
2017-10-01,592941,1094996,181647
2017-11-01,605424,1109878,182794
2017-12-01,613787,1112706,183967
2018-01-01,642924,1114663,184872
2018-02-01,674201,1120909,187740
2018-03-01,698868,1125055,189670
2018-04-01,707816,1127915,191504
2018-05-01,791613,1141153,192761
2018-06-01,794866,1150686,194455
2018-07-01,796735,1155786,195219
2018-08-01,799284,1161092,196260
2018-09-01,802460,1163023,197085
2018-10-01,803570,1170337,197600
2018-11-01,803734,1171411,197761
2018-12-01,804158,1172252,198195
[7]:
import pandas as pd
import matplotlib.pyplot as plt

# One row per sampled month, indexed by the sample date so it becomes the x axis.
data_frame = pd.DataFrame(object_count, columns=columns).set_index("date")

# One line per format type over time.
plot = data_frame.plot(figsize=(10, 4))
plot.set_xlabel("Date")
plot.grid(True)
../_images/PROJ_metrics_object_counts_6_0.png
[ ]: