2 Module : Gargantext.Pipeline
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
14 module Gargantext.Pipeline
17 import qualified Data.Text as T
18 import Data.Text.IO (readFile)
20 import Control.Arrow ((***))
21 import Data.Map.Strict (Map)
22 import qualified Data.Array.Accelerate as A
23 import qualified Data.Map.Strict as M
24 import qualified Data.List as L
25 import Data.Tuple.Extra (both)
26 ----------------------------------------------
27 import Gargantext.Core (Lang(FR))
28 import Gargantext.Core.Types (Label)
29 import Gargantext.Prelude
30 import Prelude (print, seq)
32 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
33 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
34 import Gargantext.Viz.Graph.Index (Index)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
46 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
47 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
48 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
49 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
54 workflow lang path = do
55 -- Text <- IO Text <- FilePath
58 let contexts = splitBy (Sentences 5) text
59 -- Context :: Text -> [Text]
60 -- Contexts = Paragraphs n | Sentences n | Chars n
62 myterms <- extractTerms (Mono lang) contexts
63 -- myterms # filter (\t -> not . elem t stopList)
64 -- # groupBy (Stem|GroupList)
65 printDebug "myterms" (sum $ map length myterms)
67 -- Bulding the map list
68 -- compute copresences of terms
69 -- Cooc = Map (Term, Term) Int
70 let myCooc1 = cooc myterms
71 printDebug "myCooc1" (M.size myCooc1)
73 -- Remove Apax: appears one time only => lighting the matrix
74 let myCooc2 = M.filter (>1) myCooc1
75 printDebug "myCooc2" (M.size myCooc2)
77 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
78 let myCooc3 = filterCooc ( FilterConfig (MapListSize 20 )
84 printDebug "myCooc3" $ M.size myCooc3
87 let (ti, fi) = createIndices myCooc3
88 printDebug "ti" $ M.size ti
90 let myCooc4 = toIndex ti myCooc3
91 printDebug "myCooc4" $ M.size myCooc4
93 let matCooc = map2mat (-2) (M.size ti) myCooc4
94 printDebug "matCooc" matCooc
96 -- Matrix -> Clustering
97 --let distanceMat = conditional matCooc
98 -- let distanceMat = distributional matCooc
99 -- printDebug "distanceMat" $ A.arrayShape distanceMat
100 -- printDebug "distanceMat" distanceMat
102 -- let distanceMap = mat2map distanceMat
103 -- printDebug "distanceMap" $ M.size distanceMap
105 -- let distance = fromIndex fi distanceMap
106 -- printDebug "distance" $ M.size distance
108 -- partitions <- cLouvain distanceMap
109 ------ | Building : -> Graph -> JSON
110 -- printDebug "partitions" $ length partitions
111 -- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
115 -----------------------------------------------------------
116 -- distance should not be a map since we just "toList" it (same as cLouvain)
117 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
118 -> Map (Int, Int) Double
121 data2graph labels coocs distance partitions = Graph nodes edges
123 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
124 nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
125 , n_type = Terms -- or Unknown
127 , n_label = T.unwords l
129 Attributes { clust_default = maybe 0 identity
130 (M.lookup n community_id_by_node_id) } }
132 edges = [ Edge { e_source = s
136 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
137 -----------------------------------------------------------
139 printDebug msg x = putStrLn $ msg <> " " <> show x
140 --printDebug _ _ = pure ()