#! /usr/bin/python3
"""
Field names and hardcoded value lists / bins used as sampling criteria.

A set of ranges for dates:
  *-1959, 1960-79, 1980-89, 1990-99, 2000-*

+ Hardcoded lists of values for 3 ISTEX API fields:
  - language
  - categories.wos
  - genre

NOTE(review): the active DATE constant below uses coarser bins
(*-1979, 1980-1999, 2000-*); the 5-bin variant above only survives as a
commented-out alternative. Likewise, of the three fields listed, only
GENRE is defined in the part of the module visible here -- confirm and
update this header accordingly.

why?
-----
The fields are often used as representativity criteria
(the counts for each of their values can be used as quotas
 for a proportional sample)

NB
---
The values-lists could be retrieved by a terms facet aggregation
but the API truncates them at count 10...
Unless something changes, we'll store a simplified copy here.
(Last copy from API + pruning 15/07/2015)
"""
__author__    = "Romain Loth"
__copyright__ = "Copyright 2014-5 INIST-CNRS (ISTEX project)"
__license__   = "LGPL"
__version__   = "0.2"
__email__     = "romain.loth@inist.fr"
__status__    = "Dev"

# ----------------------------------------------------------------------
# fields allowed as criteria
# (grouped according to the method we use for value listing)
# ----------------------------------------------------------------------

# auto value-listing via facet query
TERMFACET_FIELDS = [
    'corpusName',
    'qualityIndicators.pdfVersion',
    'qualityIndicators.refBibsNative',
    'language',
    'genre',
    'categories.wos',
    'host.title',
    'host.issn',
    'host.isbn',
    'serie.title',
    'serie.issn',
    'serie.isbn',
]

# binned listing via ranges (also in field_value_lists.py)
RANGEFACET_FIELDS = [
    'publicationDate',
    'copyrightDate',
    'qualityIndicators.pdfCharCount',
    'qualityIndicators.pdfWordCount',
]

# every field name accepted as a criterion: term facets first, then ranges
KNOWN_FIELDS = TERMFACET_FIELDS + RANGEFACET_FIELDS


## target genre list -------------------------------- 2
# (alternative kept for reference: one entry per individual genre value)
#GENRE = (
#    "article-commentary",        # ARTICLE
#    "brief-report",              # ARTICLE
#    "case-report",               # ARTICLE
#    "meeting-report",            # ARTICLE
#    "rapid-communication",       # ARTICLE
#    "research-article",          # ARTICLE
#    "review-article",            # ARTICLE
#    #
#    # "abstract",                # AUTRES
#    # "book-review",             # AUTRES
#    # "letter",                  # AUTRES
#    #
#    # "e-book",                  # EBOOK
#    )

### or simply major doctype groups
# GENRE = ("ARTICLE","EBOOK","AUTRES")

### or heuristic for {article ; others}
###    (only problem: cannot take nature letters :/ )
GENRE = (
    "(article OR paper)",
    "((NOT article) AND (NOT paper))",
)
### no need for wildcards because field is tokenized and possible
### values as in following list (of 2015-09-23)
#  230919 bmj       research-article
#  149297 bmj       letter
#   65591 bmj       other
#   42449 bmj       book-review
#   33521 bmj       abstract
#   19716 bmj       editorial
#  207613 ecco      Primary Document
#  102688 nature    letter              <=> article-like but missing
#   87637 nature    nw
#   35151 nature    nv
#   27468 nature    book review
#  315704 oup       other
#  277998 oup       book-review
#  243736 oup       research-article
#  187142 oup       reply
#   28312 oup       letter
#  827724 springer  Original Paper
#  129795 springer  Brief Communication
#   50527 springer  Review Paper
# 2712569 wiley     Serial article


# DATE --------------------------------------------- 3
# for dates there's no categories but bins
# use case: range => bins => quotas
# each bin is a (lower, upper) pair; "*" stands for an open bound
#~ DATE = (
#~     ("*", 1959),
#~     (1960, 1979),
#~     (1980, 1989),
#~     (1990, 1999),
#~     (2000, "*")
#~     )
DATE = (
    ("*", 1979),
    (1980, 1999),
    (2000, "*"),
)


# NBC ---------------------------------------------- 5
# bins again for number of chars <=> NBC <=> qualityIndicators.pdfCharCount
NBC = (
    ("*", 1999),
    (2000, "*"),
)


# NBW ----------------------------------------------
# bins again for number of words <=> NBW <=> qualityIndicators.pdfWordCount
NBW = (
    ("*", 499),
    (500, "*"),
)