src/Gargantext/Pipeline.hs

   1 {-|
   2 Module      : Gargantext.Pipeline
   3 Description : Text pipeline (file -> terms -> filtered co-occurrences)
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9 -}
  10
  11 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
  12 {-# LANGUAGE NoImplicitPrelude           #-}
  13
  14 module Gargantext.Pipeline
  15   where
  16
  17 import Data.Text.IO (readFile)
  18
  19 import Control.Arrow ((***))
  20 import Data.Map.Strict (Map)
  21 import qualified Data.Map.Strict as M
  22 import qualified Data.List       as L
  23 import Data.Tuple.Extra (both)
  24 ----------------------------------------------
  25 import Gargantext.Core (Lang(FR))
  26 import Gargantext.Prelude
  27
  28 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
  29 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
  30 import Gargantext.Viz.Graph.Index (Index)
  31 import Gargantext.Text.Metrics.Count (cooc, removeApax)
  32 import Gargantext.Text.Metrics
  33 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
  34 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  35
  36 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
  37
  38
  39 {-
  40   ____                             _            _
  41  / ___| __ _ _ __ __ _  __ _ _ __ | |_ _____  _| |_
  42 | |  _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
  43 | |_| | (_| | | | (_| | (_| | | | | ||  __/>  <| |_
  44  \____|\__,_|_|  \__, |\__,_|_| |_|\__\___/_/\_\\__|
  45                  |___/
  46
  47 -}
  48
  49 -- | Read the file at @path@, cut it with @splitBy (Sentences 5)@, run
  50 -- @extractTerms Multi lang@ on the pieces, and return the result of
  51 -- @filterCooc $ removeApax $ cooc@ applied to the extracted terms.
  52 workflow lang path = do
  53   corpus <- readFile path
  54   terms  <- extractTerms Multi lang $ splitBy (Sentences 5) corpus
  55   -- TODO    filter (\t -> not . elem t stopList) terms
  56   -- TODO    groupBy (Stem | GroupList)
  57
  58   -- @np FIXME optimization issue of filterCooc (too much memory consumed)
  59   let coocs = filterCooc (removeApax (cooc terms))
  60   pure coocs
  61   -- Disabled: Cooc -> Matrix
  62   --   let (ti, fi) = createIndices coocs
  63   -- Disabled: Matrix -> Clustering
  64   --   pure $ bestpartition False $ map2graph $ toIndex ti coocs
  65   --   partitions <- cLouvain $ toIndex ti $ M.map (\v -> (fromIntegral v) :: Double) coocs
  66   --   pure partitions
  67 ---- | Building : -> Graph -> JSON
  68