2 Module : Gargantext.Pipeline
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
14 module Gargantext.Pipeline
17 import qualified Data.Text as T
18 import Data.Text.IO (readFile)
20 import Control.Arrow ((***))
21 import Data.Map.Strict (Map)
22 import qualified Data.Array.Accelerate as A
23 import qualified Data.Map.Strict as M
24 import qualified Data.List as L
25 import Data.Tuple.Extra (both)
26 ----------------------------------------------
27 import Gargantext.Core (Lang(FR))
28 import Gargantext.Core.Types (Label)
29 import Gargantext.Prelude
30 import Prelude (print, seq)
32 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
33 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
34 import Gargantext.Viz.Graph.Index (Index)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc, removeApax)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
46 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
47 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
48 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
49 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
54 -----------------------------------------------------------
55 -- distance should not be a map since we just "toList" it (same as cLouvain)
56 data2graph :: [(Label, Int)] -> Map (Int, Int) Int -> Map (Int, Int) Double -> [LouvainNode] -> Graph
57 data2graph labels coocs distance partitions = Graph nodes edges
59 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
60 nodes = [ Node { n_size = coocs M.! (n, n) -- TODO lookup with default ?
61 , n_type = Terms -- or Unknown
63 , n_label = T.unwords l
65 -- TODO lookup with default ?
66 Attributes { clust_default = community_id_by_node_id M.! n } }
68 edges = [ Edge { e_source = s
72 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
73 -----------------------------------------------------------
75 -- printDebug msg x = putStrLn $ msg <> " " <> show x
76 printDebug _ _ = pure ()
78 workflow lang path = do
79 -- Text <- IO Text <- FilePath
81 let contexts = splitBy (Sentences 5) text
82 myterms <- extractTerms Mono lang contexts
83 printDebug "myterms" $ sum $ map length myterms
85 -- TODO filter (\t -> not . elem t stopList) myterms
86 -- TODO groupBy (Stem | GroupList)
88 let myCooc1 = cooc myterms
89 printDebug "myCooc1" $ M.size myCooc1
90 let myCooc2 = removeApax myCooc1
91 printDebug "myCooc2" $ M.size myCooc2
92 let myCooc3 = filterCooc myCooc2
93 printDebug "myCooc3" $ M.size myCooc3
95 let (ti, fi) = createIndices myCooc3
96 printDebug "ti" $ M.size ti
97 let myCooc4 = toIndex ti myCooc3
98 printDebug "myCooc4" $ M.size myCooc4
99 let matCooc = map2mat 0 (M.size ti) myCooc4
100 -- Matrix -> Clustering
101 let distanceMat = conditional matCooc
102 printDebug "distanceMat" $ A.arrayShape distanceMat
103 let distanceMap = mat2map distanceMat
104 printDebug "distanceMap" $ M.size distanceMap
106 let distance = fromIndex fi distanceMap
107 printDebug "distance" $ M.size distance
109 partitions <- cLouvain distanceMap
110 ---- | Building : -> Graph -> JSON
111 printDebug "partitions" $ length partitions
112 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions