/*
 * tdm-utils / index.js (kieffer/tdm-utils, branch master)
 * Last change: kieffer, 20 Apr 2017 - "Update for node-ez-istex"
 */
/* global module */
/* jslint node: true */
/* jslint indent: 2 */
'use strict';

/* Module Require */
var fs = require('fs'),
  path = require('path'),
  teeft = require('rd-teeft'),
  request = require('request'),
  mkdirp = require('mkdirp'),
  async = require('async');

/* Libs */
var jLouvain = require('./lib/jLouvain.js');

/**
 * Objet regroupant les fonctionnalités liées aux tdm
 */
var utils = {};

/**
 * Describes a TDM corpus: its name, the file listing its ISTEX ids, and its
 * input/output directories.
 */
class Corpus {
  /**
   * Builds a corpus description.
   * @param {Object} [options] Optional settings. Each field that is missing or is not a
   *   string falls back to the empty string:
   *   - {String} name : corpus name
   *   - {String} file : name of the file holding the ISTEX ids
   *   - {String} in : input directory (files downloaded from the .txt file)
   *   - {String} out : planned output directory
   */
  constructor(options) {
    // A missing `options` is treated as "no settings" so that the four fields
    // always exist as strings, exactly as they do when `options` is provided.
    const settings = options || {};
    ['name', 'file', 'in', 'out'].forEach((field) => {
      const value = settings[field];
      this[field] = (typeof value === 'string') ? value : '';
    });
  }
}

/**
 * Objet permettant de traiter un ensemble de Corpus
 */
utils.corpusManager = {};
/**
 * Sets up the EzMaster directory layout for one TDM corpus.
 *
 * The corpus description file is copied into `outputDir`, then one directory named
 * after the file is created under both `outputDir/in` and `outputDir/out`.
 *
 * @param {String} file Path to the corpus file (a .corpus file holding its description)
 * @param {String} outputDir Path to the directory that will receive the created layout
 * @param {function} cb Callback invoked when processing ends, with:
 *  - {Error} err Read/write error, if any
 *  - {Corpus} res Information about the corpus
 *  When `file` exists but is not a regular file, `cb` is invoked with no arguments.
 * @return {undefined} Return undefined
 * Example of the layout created for a corpus x:
 * ./
 *  x.corpus
 *  in/
 *     x/
 *  out/
 *     x/
 */
utils.corpusManager.init = function(file, outputDir, cb) {
  // [TEMPORARY] The corpus directories are named after the corpus file itself
  const corpusName = path.basename(file);

  // Last step: create in/<name> and out/<name>, recording each path on the corpus
  function createDirectories(corpus) {
    async.each(['in', 'out'], function(side, done) {
      const sideDir = path.join(outputDir, side, corpusName);
      mkdirp(sideDir, function(mkdirErr) {
        if (mkdirErr) {
          done(mkdirErr); // Directory creation failed, report it
          return;
        }
        corpus[side] = sideDir;
        done();
      });
    }, function(dirErr) {
      if (dirErr) {
        cb(dirErr); // Report the error raised while creating the directories
        return;
      }
      cb(null, corpus); // Corpus fully initialised
    });
  }

  // Middle step: write the description into outputDir, then build the Corpus
  function copyDescription(content) {
    const copyPath = path.join(outputDir, corpusName);
    fs.writeFile(copyPath, content, 'utf-8', function(writeErr) {
      if (writeErr) {
        cb(writeErr); // Write failed, report it
        return;
      }
      createDirectories(new Corpus({
        'name': corpusName,
        'file': copyPath
      }));
    });
  }

  // First step: make sure the source is a regular file and read it
  return fs.stat(file, function(statErr, stats) {
    if (statErr) {
      cb(statErr); // stat failed, report it
      return;
    }
    if (!stats.isFile()) {
      cb(); // Not a regular file: ignored
      return;
    }
    fs.readFile(file, 'utf-8', function(readErr, content) {
      if (readErr) {
        cb(readErr); // Read failed, report it
        return;
      }
      copyDescription(content);
    });
  });
};

/**
 * Indexes every file of a corpus directory and writes the indexations as JSON.
 *
 * Each entry of `inputDir` is read as UTF-8 text and passed to `teeft.index`; the
 * `keywords` of the returned value are kept. The document id is the file name with
 * a trailing `.txt` removed.
 *
 * @param {String} inputDir Input directory containing .txt files
 * @param {String} outputFile File that receives the JSON-serialised list of indexations
 * @param {function} cb Callback invoked when processing ends, with:
 *  - {Error} err Error raised during processing
 *  - {Array} res List of indexations ({id, keywords}), in the order returned by `fs.readdir`
 * @return {undefined} Return undefined
 */
utils.corpusManager.indexAll = function(inputDir, outputFile, cb) {
  fs.readdir(inputDir, function(err, filenames) {
    if (err) return cb(err); // I/O error
    // async.map keeps the results aligned with `filenames`, so the output does not
    // depend on which read happens to finish first.
    async.map(filenames, function(filename, callback) {
        fs.readFile(path.join(inputDir, filename), 'utf-8', function(err, content) {
          if (err) return callback(err); // I/O error
          var keywords;
          try {
            // A throw here would otherwise escape from the fs callback as an
            // uncaught exception instead of reaching `cb`.
            keywords = teeft.index(content).keywords;
          } catch (indexErr) {
            return callback(indexErr);
          }
          return callback(null, {
            'id': path.basename(filename, '.txt'),
            'keywords': keywords
          });
        });
      },
      function(err, result) {
        if (err) return cb(err); // First error reported by any file
        // Write data
        fs.writeFile(outputFile, JSON.stringify(result), 'utf-8', function(err) {
          if (err) return cb(err);
          return cb(null, result);
        });
      });
  });
};

utils.graphs = {};

/**
 * Builds a document-to-document graph (links between documents that share keywords),
 * assigns a community to every document with jLouvain and writes the graph as JSON.
 *
 * Documents are identified by their position in `indexations`. Two documents are
 * linked with a weight equal to the number of distinct terms they share; links whose
 * weight is below `options.minLinkValue` are dropped.
 *
 * @param {Array} indexations List of indexations; each item is read as {id, keywords}
 *   where every keyword has a `term` property
 * @param {Object} options Optional settings:
 *   - {String} output : path of the JSON file to write (default './cache/docToDoc.json')
 *   - {Number} minLinkValue : minimal weight a link needs to be kept (default 0)
 * @param {function} cb Callback invoked when processing ends, with:
 *  - {Error} err Error raised while writing the file
 *  - {Object} res Graph data: {nodes: [{id, istex, value, group}], links: [{source, target, value}]}
 * @return {undefined} Return undefined
 */
utils.graphs.docToDoc = function(indexations, options, cb) {
  if (!options) options = {};
  const output = options.output || './cache/docToDoc.json';
  const minLinkValue = options.minLinkValue || 0;
  // Null-prototype dictionaries: with a plain {} a term such as "constructor" or
  // "toString" would resolve to an inherited member and make `.push` throw.
  const terms = Object.create(null); // term -> ascending list of document indexes containing it
  const matrix = Object.create(null); // "min,max" -> number of shared terms (sparse, upper half only)
  const edges = []; // [{'source': '', 'target': '', 'weight': 0}, ...] for jLouvain
  const nodes = []; // [index, ...] for jLouvain
  const result = {
    'nodes': [],
    'links': []
  };

  // Construction of the terms dictionary
  for (let docIndex = 0; docIndex < indexations.length; docIndex++) {
    const docKeywords = indexations[docIndex].keywords;
    for (let k = 0; k < docKeywords.length; k++) {
      const term = docKeywords[k].term;
      if (!terms[term]) terms[term] = [];
      const docs = terms[term];
      // Documents are visited in increasing order, so a repeat of the same term in
      // the same document is always the last entry; skipping it avoids self-links
      // and inflated weights.
      if (docs[docs.length - 1] !== docIndex) docs.push(docIndex);
    }
  }

  // Construction of the doc-doc matrix: one increment per pair of documents sharing a term
  for (const term in terms) {
    const docs = terms[term];
    for (let a = 0; a < docs.length - 1; a++) {
      for (let b = a + 1; b < docs.length; b++) {
        // `docs` is strictly ascending, hence docs[a] is the min and docs[b] the max
        const pair = docs[a] + ',' + docs[b];
        if (!matrix[pair]) matrix[pair] = 0;
        matrix[pair]++;
      }
    }
  }

  // Construction of the links kept in the graph
  // NOTE(review): source/target come from split() and are therefore strings, whereas
  // node ids are numbers; kept as is so the written JSON stays unchanged.
  for (const pair in matrix) {
    const ends = pair.split(',');
    const weight = matrix[pair];
    if (weight >= minLinkValue) {
      edges.push({
        'source': ends[0],
        'target': ends[1],
        'weight': weight
      });
      result.links.push({
        'source': ends[0],
        'target': ends[1],
        'value': weight
      });
    }
  }

  // Construction of the nodes: one per document, labelled with its first five terms at most
  for (let docIndex = 0; docIndex < indexations.length; docIndex++) {
    const indexation = indexations[docIndex];
    nodes.push(docIndex);
    result.nodes.push({
      'id': docIndex,
      'istex': indexation.id,
      'value': indexation.keywords.slice(0, 5).map(function(keyword) {
        return keyword.term;
      }).join('; '),
      'group': 0
    });
  }

  // Create the "community"
  const community = jLouvain().nodes(nodes).edges(edges);
  const partition = community();
  // Assign its community to each node (keys of the partition are used as node indexes)
  for (const nodeIndex in partition) {
    result.nodes[nodeIndex].group = partition[nodeIndex];
  }

  // Write data
  fs.writeFile(output, JSON.stringify(result), 'utf-8', function(err) {
    if (err) return cb(err);
    return cb(null, result);
  });
};

module.exports = utils;