Newer
Older
m2m / m2m.pl
@besagni besagni on 19 Jan 2018 8 KB Version 2.3.0 (Ajout du champ ARK)
#!/usr/bin/perl


# Déclaration des pragmas
use strict;
use utf8;
use open qw/:std :utf8/;

# Appel des modules externes de base
use Encode qw(decode_utf8 encode_utf8 is_utf8);
use Getopt::Long;

# Appel des modules spécifiques à l'application
use URI::Encode qw(uri_encode uri_decode);
use LWP::Simple;
use JSON;
use Text::Unidecode;
## use XML::Twig;

my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;

my $version     = "2.3.0";
my $dateModif   = "19 Janvier 2018";

# Variables
my $destination = "";
my $help        = 0;
my $mail        = "";
my $numero      = 0;
my $quiet       = 0;
my $requete     = "";

eval	{
	$SIG{__WARN__} = sub {usage(1);};
	GetOptions(
		"destination=s" => \$destination,
#		"help"          => \$help,
		"mail=s"        => \$mail,
#		"numero=i"      => \$numero,
		"quiet"         => \$quiet,
		"requete=s"     => \$requete,
		);
	};
$SIG{__WARN__} = sub {warn $_[0];};

usage(0) if $help;
usage(2) if not $destination;

my $sender = "";
if ( $mail ) {
	if ( $mail !~ /^\w+(\.\w+)*\@\w+(\.\w+)+\z/ ) {
		print STDERR "Erreur : adresse e-mail \"$mail\" incorrect\n";
		exit 3;
		}
	$mail =~ s/(\W)/\\$1/go;
	my $hostname = `hostname`;
	my $domain   = `hostname -d`;
	$sender = "$ENV{'USER'}\@$hostname.$domain";
	$sender =~ s/[\n\r]+//go;
	}

if ( $destination eq "-" ) {
	open(OUT, ">&STDOUT") or die "$!,";
	}
elsif ( $destination =~ /\.gz\z/ ) {
	open(OUT, "| gzip -c -9 > $destination") or die "$!,";
	}
elsif ( $destination =~ /\.bz2\z/ ) {
	open(OUT, "| bzip2 -c -9 > $destination") or die "$!,";
	}
else	{
	$destination =~ s/\W*\z/.gz/;
	open(OUT, "| gzip -c -9 > $destination") or die "$!,";
	}

# Paramètres de l'API ISTEX
my $base = "https://api.istex.fr";
my $url  = "$base/document/?q=";
my $req  = "*";
my $out  = "output=*";
my $size = 300;

if ( $requete ) {
	$req = decode_utf8($requete, Encode::FB_QUIET);
	if ( $requete ) {
		die "Erreur : La requête n'est pas en UTF-8\n";
		}
	$req = propre($req);
	}

#if ( $req =~ / /o ) {
#	$req =~ s/ /+/go;
#	}
my $uri = "$url$req&$out&size=$size&scroll=167s";

# Variables concernant les documents
my $corpus  = "";
my $etape   = 1000000;
my $id      = "";
my $num     = $numero;
my $suivant = "";
my $total   = 0;

# Première itération
my $heure = date();;
if ( not $quiet ) {
	print STDERR "Processus n° $$\n\n";
	print STDERR "Démarrage le $heure\n";
	}
message("Processus n° $$\nDémarrage le $heure");

my $nb = 10;
while( $nb ) {
	print STDERR "URI : \"$uri\"\n" if not $quiet;
	my $json = get("$uri");
	my $perl = undef;
	if ( defined $json ) {
		eval	{
			$perl = decode_json $json;
			};
		if ( $@ ) {
			print STDERR "[" . pretty($num) . "] Erreur JSON => Perl : $@ \n";
			exit 6;
#			print STDERR "[", pretty($num), "] Erreur JSON => Perl : $@ \n";
#			sleep 30;
#			next;
			}
		my %top = %{$perl};
		$total = $top{'total'};
		if ( $total > 0 ) {
			print STDERR "Total : $total\n" if not $quiet;
			if ( $top{'nextScrollURI'} ) {
				$suivant = $top{'nextScrollURI'};
				}
			else	{
				$suivant = "";
				}
			my @hits = @{$top{'hits'}};
			foreach my $hit (@hits) {
				$num ++;
				traite($hit);
				}
			$nb = 0;
			}
		else	{
			print STDERR "Aucun document pour la requête \'$req\'\n";
			exit 4;
			}
		}
	else	{
		$heure = date();;
		if ( $nb ) {
			$nb --;
			print STDERR "Pause (10 sec.) à la notice ", pretty($num), " le $heure\n";
			sleep 10;
			}
		else	{
			print STDERR "Aucune réponse du serveur \"$base\" le $heure\n";
			exit 5;
			}
		}
	}

# Itérations suivantes
$nb = 10;
while ( $suivant ) {
	my $json = get("$suivant");
	my $perl = undef;
	if ( defined $json ) {
		$nb = 10;
		eval	{
			$perl = decode_json $json;
			};
		if ( $@ ) {
			print STDERR "[", pretty($num), "] Erreur JSON => Perl : $@ \n";
			sleep 30;
			next;
			}
		$perl = decode_json $json;
		my %top = %{$perl};
		if ( $top{'nextScrollURI'} ) {
			$suivant = $top{'nextScrollURI'};
			}
		else	{
			$suivant =  "";
			}
		my @hits = @{$top{'hits'}};
		foreach my $hit (@hits) {
			$num ++;
			traite($hit);
			}
		}
	else	{
		$heure = date();;
		if ( $nb ) {
			$nb --;
			print STDERR "Pause (10 sec.) à la notice ", pretty($num), " le $heure\n";
			sleep 10;
			}
		else	{
			print STDERR "Aucune réponse du serveur \"$base\" le $heure\n";
			sleep 20;
			$heure = date();;
			print STDERR "Reprise le $heure\n";
			$nb = 10;
			}
		}
	}

close OUT;

$heure = date();;
if ( not $quiet ) {
	print STDERR "Arrêt le $heure\n";
	}
if ( $num > 1 ) {
	message("Arrêt le $heure\n$num/$total enregistrements traités");
	}
else	{
	message("Arrêt le $heure\n$num/$total enregistrement traité");
	}


exit 0;


sub usage
{
my $code = shift;

print STDERR "Usage : $programme -d destination [ -r 'requête' ] [ -m adresse_mail ] [ -q ]\n";

exit $code;
}

sub traite
{
my $hit = shift;

my %hit = %{$hit};
$id  = $hit{'id'};
$corpus = defined $hit{'corpusName'} ? $hit{'corpusName'} : "Inconnu";
my $ark    = '?';
my $lien   = "";
my $type   = "";
my $langue = "";
my $resume = 0;
my $pdf    = "?";
my $titre  = "";
my $issn   = "";
my $eissn  = "";
my $isbn   = "";
my $volume = "";
my $issue  = "";
my $date   = "";
my @catWoS = ();
my @catSM  = ();
my @catPF  = ();	# Pascal & Francis
my @sujets = ();
my @types  = ();
my $genre  = "";
if ( defined $hit{'arkIstex'} ) {
	$ark = substr($hit{'arkIstex'}, 4);
	}
if ( defined $hit{'genre'} ) {
	$genre = join(", ", @{$hit{'genre'}});
	}
if ( defined $hit{'publicationDate'} ) {
	$date = $hit{'publicationDate'};
	}
if ( defined $hit{'copyrightDate'} ) {
	$date = $hit{'copyrightDate'};
	}
if ( defined $hit{'language'} ) {
	$langue = join(", ", @{$hit{'language'}});
	}
if ( defined $hit{'abstract'} ) {
	$resume = length($hit{'abstract'});
	}
if ( defined $hit{'qualityIndicators'} ) {
	my %indicateurs = %{$hit{'qualityIndicators'}};
	if ( defined $indicateurs{'pdfVersion'} ) {
		$pdf = $indicateurs{'pdfVersion'};
		}
	}
if ( defined $hit{'categories'} ) {
	my %categories = %{$hit{'categories'}};
	if ( defined $categories{'wos'} ) {
		@catWoS = @{$categories{'wos'}};
		}
	if ( defined $categories{'scienceMetrix'} ) {
		@catSM = @{$categories{'scienceMetrix'}};
		}
	if ( defined $categories{'inist'} ) {
		@catPF = @{$categories{'inist'}};
		}
	}
if ( defined $hit{'subject'} ) {
	foreach my $subject (@{$hit{'subject'}}) {
		my $valeur = $subject->{'value'};
		if ( $valeur =~ /[\n\t]+/o ) {
			$valeur =~ s/[\n\t]+//go;
			print STDERR "Valeur de descripteur incorrect pour la notice n° $num [$id]\n";
			}
		push(@sujets, $valeur);
		}
	}
if ( defined $hit{'host'} ) {
	my %host = %{$hit{'host'}};
	if ( defined $host{'issn'} ) {
		$issn = join(", ", @{$host{'issn'}});
		}
	if ( defined $host{'eissn'} ) {
		$eissn = join(", ", @{$host{'eissn'}});
		}
	if ( defined $host{'isbn'} ) {
		$isbn = join(", ", @{$host{'isbn'}});
		}
	if ( defined $host{'title'} ) {
		$titre = $host{'title'};
		}
	if ( defined $host{'volume'} ) {
		$volume = $host{'volume'};
		}
	if ( defined $host{'issue'} ) {
		$issue = $host{'issue'};
		}
	if ( defined $host{'genre'} ) {
		if ( $genre ) {
			$genre = " ; $genre";
			}
		$genre = join(", ", @{$host{'genre'}}) . "$genre";
		}
	}
if ( defined $hit{'enrichments'} ) {
	my %enrichment = %{$hit{'enrichments'}};
	push(@types, sort keys %enrichment);
	}
print OUT pretty($num) , "\t$corpus\t$ark\t$id\t$genre\t$issn\t$eissn\t$isbn";
print OUT "\t$titre\t$date\t$volume\t$issue\t$langue\t$resume car.\t$pdf\t";
print OUT join(" ; ", @types), "\t", join(" ; ", @catWoS), "\t";
print OUT join(" ; ", @catSM), "\t", join(" ; ", @catPF), "\t";
print OUT join(" ; ", @sujets), "\n";

if ( $num % $etape == 0 ) {
	my $heure = date();;
#	my $tmp = $num;
#	1 while $tmp =~ s/(\d+)(\d\d\d)\b/$1.$2/o;
	my $nombre = pretty($num);
	print STDERR "Barre des $nombre notices atteinte le $heure\n" if not $quiet;
	message("Barre des $nombre notices atteinte le $heure");
	}
}

sub propre
{
my $chaine = shift;

# Vérification de jeu de caractères (doit être UTF-8)
if ( is_utf8($chaine, Encode::FB_QUIET) ) {

	# URLencodage
	$chaine = uri_encode($chaine);
	$chaine =~ s/&/%26/go;

	return $chaine;
	}
else	{
	die "la chaîne de caractères \"$chaine\" n'est pas en UTF-8";
	}
}

sub pretty
{
my $valeur = shift;
1 while $valeur =~ s/(\d+)(\d\d\d)\b/$1.$2/o; 
return $valeur;
}

sub message
{
my $texte = shift;

return if not $mail;

open(MAIL, "| /usr/sbin/sendmail $mail") or die "$!,";

binmode(MAIL, ":utf8");
my $ancien = select MAIL;
$| = 1;
select $ancien;

print MAIL "Mime-Version: 1.0\nSubject: Commentaires sur l'extraction de données ISTEX\n";
print MAIL "From: \"M2M\" <$sender>\n";
print MAIL "Content-Type: text/plain; charset=utf-8\n";
print MAIL "Content-Transfer-Encoding: 8bit\n\n";
print MAIL "$texte\n\n.\n";
close MAIL;
}

sub date
{
my @time = localtime();

my $jour = (qw(Dimanche Lundi Mardi Mercredi Jeudi Vendredi Samedi))[$time[6]];
my $mois = (qw(Janvier Février Mars Avril Mai juin Juillet Août Septembre Octobre Novembre Décembre))[$time[4]];
my $annee = $time[5] + 1900;

my $date = "$jour $time[3] $mois $annee ";
$date .= sprintf("%02d:%02d:%02d", $time[2], $time[1], $time[0]);

return $date;
}