Newer
Older
libconsulte / field_value_lists.py
#! /usr/bin/python3
"""
A set of ranges for dates: *-1979, 1980-1999, 2000-*
 +
Hardcoded lists of values for 3 ISTEX API fields:
 - language
 - categories.wos
 - genre

why?
-----
The fields are often used as representativity criteria (the counts for
each of their values can be used as quotas for a proportional sample)

NB
---
The values-lists could be retrieved by a terms facet aggregation but the
API truncates them at count 10... Unless something changes, we'll store
a simplified copy here.        (Last copy from API + pruning 15/07/2015)
"""
__author__    = "Romain Loth"
__copyright__ = "Copyright 2014-5 INIST-CNRS (ISTEX project)"
__license__   = "LGPL"
__version__   = "0.2"
__email__     = "romain.loth@inist.fr"
__status__    = "Dev"

# ----------------------------------------------------------------------
# fields allowed as criteria
# (grouped according to the method we use for value listing)
# ----------------------------------------------------------------------
# Fields whose values are listed automatically through a terms facet query.
TERMFACET_FIELDS = [
    "corpusName",
    "qualityIndicators.pdfVersion",
    "qualityIndicators.refBibsNative",
    "language",
    "genre",
    "categories.wos",
    # host.* fields
    "host.title",
    "host.issn",
    "host.isbn",
    # serie.* fields
    "serie.title",
    "serie.issn",
    "serie.isbn",
]

# Fields whose values are binned into ranges instead of being enumerated
# (range tuples such as DATE, NBC and NBW are declared further down in
# this module).
RANGEFACET_FIELDS = [
    "publicationDate",
    "copyrightDate",
    "qualityIndicators.pdfCharCount",
    "qualityIndicators.pdfWordCount",
]

# Every field accepted as a criterion: term-facet fields first, then the
# range-facet fields, in declaration order.
KNOWN_FIELDS = TERMFACET_FIELDS + RANGEFACET_FIELDS



## target genre list -------------------------------- 2
#GENRE = (
#	"article-commentary",        # ARTICLE
#	"brief-report",              # ARTICLE
#	"case-report",               # ARTICLE
#	"meeting-report",            # ARTICLE
#	"rapid-communication",       # ARTICLE
#	"research-article",          # ARTICLE
#	"review-article",            # ARTICLE
#	
#	# "abstract",          # AUTRES
#	# "book-review",       # AUTRES
#	# "letter",            # AUTRES
#	
#	# "e-book",            # EBOOK
#	)

### or simply major doctype groups
# GENRE = ("ARTICLE","EBOOK","AUTRES")


### Heuristic two-way split: {article-like ; everything else}.
### Known limitation: it cannot capture nature "letter" documents,
### although they are article-like.
GENRE = (
    "(article OR paper)",               # article-like genres
    "((NOT article) AND (NOT paper))",  # all remaining genres
)
### no need for wildcards because the field is tokenized; its possible values are shown in the following list (as of 2015-09-23)
# 230919	bmj	research-article
# 149297	bmj	letter
# 65591	bmj	other
# 42449	bmj	book-review
# 33521	bmj	abstract
# 19716	bmj	editorial
# 207613	ecco	Primary Document
# 102688	nature	letter         <=> article-like but missing
# 87637	nature	nw
# 35151	nature	nv
# 27468	nature	book review
# 315704	oup	other
# 277998	oup	book-review
# 243736	oup	research-article
# 187142	oup	reply
# 28312	oup	letter
# 827724	springer	Original Paper
# 129795	springer	Brief Communication
# 50527	springer	Review Paper
# 2712569	wiley	Serial article




# DATE --------------------------------------------- 3
# for dates there are no categories, only bins
# use case: range => bins => quotas
#~ DATE = (
	#~ ("*", 1959),
	#~ (1960, 1979),
	#~ (1980, 1989),
	#~ (1990, 1999),
	#~ (2000, "*")
	#~ )
# Active date bins, as (lower, upper) pairs; "*" marks an open-ended side.
DATE = (
    ("*", 1979),   # up to 1979
    (1980, 1999),  # 1980 through 1999
    (2000, "*"),   # 2000 onwards
)


# NBC ---------------------------------------------- 5
# Bins for the number of characters (NBC), i.e. the field
# qualityIndicators.pdfCharCount; "*" marks an open-ended side.
NBC = (
    ("*", 1999),  # up to 1999
    (2000, "*"),  # 2000 and more
)

# NBW ---------------------------------------------- 5
# Bins again, this time for the number of words (NBW); "*" marks an
# open-ended side.
# NOTE(review): presumably maps to qualityIndicators.pdfWordCount, by
# analogy with NBC <=> qualityIndicators.pdfCharCount -- confirm.
NBW = (
    ("*", 499),  # up to 499
    (500, "*"),  # 500 and more
)