DataONE Contributors

Examine the Solr index to determine the number of contributors to content exposed by DataONE. The count provided here is likely higher than the actual number of contributors because the indexing process performs only minimal pre-processing of the names added to the index. For example, in some cases the same name appears in both ASCII and Unicode variants, and the variants are counted as separate contributors.

[5]:
import json
import pprint
from datetime import datetime
from datetime import timezone

import dateutil
import nameparser
import requests

# strftime pattern used to render datetimes inside Solr date range queries
SOLR_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Current time as a naive UTC datetime. datetime.utcnow() is deprecated since
# Python 3.12, so take a timezone-aware "now" and drop the tzinfo; the value
# stays naive and therefore remains comparable with the naive T_START.
T_NOW = datetime.now(timezone.utc).replace(tzinfo=None)
# Naive datetime marking the start of the reporting period
T_START = datetime(2012, 7, 1)

# provides a map from a solr entry to a more normalized form
contributor_index = {}
[6]:
def normalize_name(name):
    """Reduce a personal name to a coarse ``LAST_GIV_M`` matching key.

    The name is split with ``nameparser.HumanName``. Each of the last, first
    and middle parts has surrounding spaces and ``:.,;()`` characters removed
    and is upper-cased. The key is then built from the whole last name, at
    most the first three characters of the first name, and at most the first
    character of the middle name, joined with underscores.
    """
    trim_chars = ' :.,;()'
    parts = nameparser.HumanName(name)

    surname = parts.last.strip(trim_chars).upper()
    # Slicing never lengthens a string, so short (or empty) parts pass
    # through unchanged.
    given_prefix = parts.first.strip(trim_chars).upper()[:3]
    middle_initial = parts.middle.strip(trim_chars).upper()[:1]

    return "_".join((surname, given_prefix, middle_initial))

def getContributors(t_start=None, t_end=None):
    """Fetch the investigator facet from the DataONE Solr index.

    Runs a facet-only query (no result rows) on the ``investigator`` field,
    optionally restricted to a ``dateUploaded`` range.

    Args:
        t_start: datetime giving the lower bound of ``dateUploaded``, or None
            for an open lower bound.
        t_end: datetime giving the upper bound of ``dateUploaded``. When it is
            None and ``t_start`` is given, ``T_NOW`` is used. When both
            bounds are None the query is not restricted by date.

    Returns:
        A tuple ``(names, counts, unames)``:
        ``names`` lists the investigator values with a count above zero,
        ``counts`` holds the matching facet counts in the same order, and
        ``unames`` maps each normalized name (see ``normalize_name``) to the
        summed count of all index values that share that normalized form.

    Raises:
        requests.HTTPError: if the service answers with an error status.

    Side effects:
        Every investigator value seen is cached, with its normalized form, in
        the module-level ``contributor_index``.
    """
    url = "https://cn.dataone.org/cn/v2/query/solr/"
    params = {
        "q": "*:*",
        "facet": "on",
        "rows": "0",
        "facet.limit": "-1",
        "facet.field": "investigator",
        "wt": "json",
    }
    # Only narrow the query when at least one bound was supplied; with no
    # bounds the match-all query above is used as is.
    if t_start is not None or t_end is not None:
        lower = "*" if t_start is None else f'"{t_start:{SOLR_TIME_FORMAT}}"'
        # Both bounds go through SOLR_TIME_FORMAT, including the T_NOW
        # fallback, so the range never contains a default str() rendering.
        upper_time = T_NOW if t_end is None else t_end
        upper = f'"{upper_time:{SOLR_TIME_FORMAT}}"'
        params["q"] = f"dateUploaded:[{lower} TO {upper}]"

    response = requests.get(url, params=params)
    # Surface HTTP failures here instead of as a confusing JSON/KeyError later.
    response.raise_for_status()
    data = response.json()

    # The facet is read as a flat list alternating value, count, value, ...
    investigators = data["facet_counts"]["facet_fields"]["investigator"]
    names = []
    counts = []
    unames = {}
    for name, n in zip(investigators[0::2], investigators[1::2]):
        if name not in contributor_index:
            contributor_index[name] = normalize_name(name)
        if n > 0:
            names.append(name)
            counts.append(n)
            nname = contributor_index[name]
            unames[nname] = unames.get(nname, 0) + n
    return names, counts, unames


[7]:
# A bare "import dateutil" does not guarantee that the relativedelta
# submodule is loaded, so import the class explicitly before using it.
from dateutil.relativedelta import relativedelta

# Emit, as CSV, the number of contributors with content uploaded up to each
# monthly step from T_START until T_NOW. "contributors" counts raw index
# values and "unique" counts distinct normalized names.
c_date = T_START
name_count = []
columns = ["date","contributors","unique"]
print(",".join(columns))
while c_date < T_NOW:
    names, counts, unames = getContributors(t_end=c_date)
    entry = (c_date, len(names), len(unames))
    print(f"{entry[0]:%Y-%m-%d},{entry[1]},{entry[2]}")
    name_count.append(entry)
    # Advance by one calendar month
    c_date = c_date + relativedelta(months=+1)

date,contributors,unique
2012-07-01,10676,9304
2012-08-01,11835,10446
2012-09-01,12310,10888
2012-10-01,12722,11270
2012-11-01,13501,12006
2012-12-01,14083,12546
2013-01-01,14532,12952
2013-02-01,15344,13710
2013-03-01,16189,14505
2013-04-01,16865,15129
2013-05-01,17419,15623
2013-06-01,18375,16520
2013-07-01,19018,17130
2013-08-01,19658,17715
2013-09-01,20428,18430
2013-10-01,21421,19322
2013-11-01,22167,20033
2013-12-01,22831,20622
2014-01-01,23401,21162
2014-02-01,24338,21993
2014-03-01,25228,22782
2014-04-01,26032,23532
2014-05-01,26900,24350
2014-06-01,27628,25038
2014-07-01,28565,25882
2014-08-01,29560,26826
2014-09-01,30231,27452
2014-10-01,31694,28824
2014-11-01,32790,29856
2014-12-01,34659,31610
2015-01-01,36014,32878
2015-02-01,37110,33893
2015-03-01,38396,34970
2015-04-01,39941,36428
2015-05-01,41454,37838
2015-06-01,42865,39138
2015-07-01,44358,40544
2015-08-01,45758,41840
2015-09-01,47004,43004
2015-10-01,48472,44357
2015-11-01,49924,45713
2015-12-01,51176,46853
2016-01-01,52655,48188
2016-02-01,54005,49404
2016-03-01,55489,50766
2016-04-01,58751,53750
2016-05-01,60908,55241
2016-06-01,62549,56743
2016-07-01,63852,57922
2016-08-01,65059,59017
2016-09-01,66838,60595
2016-10-01,68334,61934
2016-11-01,71910,65104
2016-12-01,73637,66616
2017-01-01,78433,70965
2017-02-01,80177,72473
2017-03-01,81667,73807
2017-04-01,83468,75376
2017-05-01,85766,77424
2017-06-01,87704,79145
2017-07-01,89503,80735
2017-08-01,91252,82258
2017-09-01,117806,107285
2017-10-01,121396,110474
2017-11-01,125236,113861
2017-12-01,128070,116386
2018-01-01,132920,120822
2018-02-01,136090,123675
2018-03-01,138273,125459
2018-04-01,140471,127269
2018-05-01,142387,128921
2018-06-01,144035,130258
2018-07-01,145611,131579
2018-08-01,147540,133097
2018-09-01,149416,134701
2018-10-01,150282,135418
2018-11-01,150371,135449
2018-12-01,150423,135467
2019-01-01,150508,135499
2019-02-01,150588,135547
2019-03-01,150677,135602
2019-04-01,152300,136963
2019-05-01,152433,137040
2019-06-01,152544,137094
2019-07-01,152616,137115
2019-08-01,152703,137160
2019-09-01,152758,137179
2019-10-01,152779,137185
2019-11-01,152819,137192
2019-12-01,152885,137197
2020-01-01,153091,137349
2020-02-01,153178,137364
2020-03-01,153302,137435
2020-04-01,153375,137463
2020-05-01,153434,137483
2020-06-01,153529,137506
2020-07-01,153803,137607
2020-08-01,153933,137648
2020-09-01,154109,137722
2020-10-01,154181,137745
2020-11-01,154263,137762
2020-12-01,154345,137787
2021-01-01,154518,137865
2021-02-01,155460,138625
2021-03-01,155515,138643
2021-04-01,155579,138665
2021-05-01,155670,138683
2021-06-01,155734,138698
2021-07-01,155833,138727
2021-08-01,155900,138745
2021-09-01,155984,138766
2021-10-01,156064,138790
2021-11-01,156203,138806
2021-12-01,156298,138840
2022-01-01,156408,138885
2022-02-01,156551,138942
2022-03-01,156651,138967
2022-04-01,156725,138976
2022-05-01,156828,139000
2022-06-01,156882,139014
2022-07-01,156960,139042
2022-08-01,157061,139055
2022-09-01,157175,139077
2022-10-01,157256,139099
2022-11-01,157341,139118
2022-12-01,157390,139128
2023-01-01,157493,139171
2023-02-01,157563,139190
2023-03-01,157611,139198
2023-04-01,157667,139209
2023-05-01,157750,139227
2023-06-01,157825,139248
2023-07-01,157893,139262
2023-08-01,157941,139270
[8]:
import pandas as pd
import matplotlib.pyplot as plt
# Tabulate the collected (date, contributors, unique) entries, indexed by date
data_frame = pd.DataFrame(name_count, columns=columns).set_index('date')

# Draw both count series against the date index
plot = data_frame.plot(figsize=(10, 4))
plot.grid(True)
plot.set_xlabel("Date")
../_images/PROJ_metrics_contributors_4_0.png
[ ]: