{-|
Module      : Gargantext.Pipeline
Description : Server API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}

module Gargantext.Pipeline
  where
import qualified Data.Text as T
import Data.Text.IO (readFile)

import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Core.Types (Label)
import Gargantext.Prelude
import Prelude (print, seq)

import Gargantext.Viz.Graph.Index (Index, score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
{-
  ____                             _            _
 / ___| __ _ _ __ __ _  __ _ _ __ | |_ _____  _| |_
| |  _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | ||  __/>  <| |_
 \____|\__,_|_|  \__, |\__,_|_| |_|\__\___/_/\_\\__|
                 |___/
-}
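-- | A rough usage sketch (the corpus path below is purely hypothetical):
--
-- > workflow FR "examples/corpus.txt"
--
-- reads the file, splits it into sentence contexts, extracts terms, builds and
-- filters the co-occurrence map, and prints intermediate sizes along the way;
-- the clustering and graph-building steps further down are still commented out.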
workflow lang path = do
  -- Text <- IO Text <- FilePath
  text <- readFile path

  -- context :: Text -> [Text]
  let contexts = splitBy (Sentences 5) text

  myterms <- extractTerms Mono lang contexts
  -- myterms <- extractTerms (Mono lang) contexts # filter (\t -> not . elem t stopList)
  --                                              # groupBy (Stem|GroupList)
  printDebug "myterms" (sum $ map length myterms)
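  -- myterms groups the extracted terms by context (one list per context), so
  -- the debug line above reports the total number of terms over the corpus.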
  -- Building the co-occurrence map list
  let myCooc1 = cooc myterms
  printDebug "myCooc1" (M.size myCooc1)
  -- Remove hapax (pairs appearing only once) => lightening the matrix
  let myCooc2 = M.filter (>1) myCooc1
  printDebug "myCooc2" (M.size myCooc2)
  -- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
  let myCooc3 = filterCooc ( FilterConfig (MapListSize     20 )
                                          (InclusionSize 1000 )
                                          (SampleBins      10 )
                                          (Clusters         3 )
                                          (DefaultValue     0 )
                           ) myCooc2
  printDebug "myCooc3" $ M.size myCooc3
  let (ti, fi) = createIndices myCooc3
  printDebug "ti" $ M.size ti

  let myCooc4 = toIndex ti myCooc3
  printDebug "myCooc4" $ M.size myCooc4

  let matCooc = map2mat (-2) (M.size ti) myCooc4
  printDebug "matCooc" matCooc
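  -- map2mat builds a dense (M.size ti) x (M.size ti) matrix from the indexed
  -- co-occurrence map, using -2 as the value for term pairs absent from the map.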
  -- Matrix -> Clustering
  -- let distanceMat = conditional matCooc
  -- let distanceMat = distributional matCooc
  -- printDebug "distanceMat" $ A.arrayShape distanceMat
  -- printDebug "distanceMat" distanceMat

  -- let distanceMap = mat2map distanceMat
  -- printDebug "distanceMap" $ M.size distanceMap

  -- let distance = fromIndex fi distanceMap
  -- printDebug "distance" $ M.size distance

  -- partitions <- cLouvain distanceMap
  ------ | Building : -> Graph -> JSON
  -- printDebug "partitions" $ length partitions
  -- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
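  -- The commented-out steps above sketch the rest of the pipeline: compute a
  -- distance matrix (conditional or distributional), convert it back to a map,
  -- cluster it with the C++ Louvain implementation, and hand the partitions to
  -- data2graph below.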
-----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int
                             -> Map (Int, Int) Double
                             -> [LouvainNode]
                             -> Graph
data2graph labels coocs distance partitions = Graph nodes edges
  where
    community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]

    nodes = [ Node { n_size  = maybe 0 identity (M.lookup (n,n) coocs)
                   , n_type  = Terms -- or Unknown
                   , n_id    = cs (show n)
                   , n_label = T.unwords l
                   , n_attributes =
                       Attributes { clust_default = maybe 0 identity
                                      (M.lookup n community_id_by_node_id) } }
            | (l, n) <- labels ]

    edges = [ Edge { e_source = s
                   , e_target = t
                   , e_weight = w
                   , e_id     = i }
            | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
-----------------------------------------------------------
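-- A sketch of how data2graph is meant to be called, mirroring the commented-out
-- tail of workflow above:
--
-- > data2graph (M.toList ti) myCooc4 distanceMap partitions
--
-- Each label becomes a Node sized by its self-co-occurrence count and tagged
-- with its Louvain community id; each entry of the distance map becomes a
-- weighted Edge.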
printDebug msg x = putStrLn $ msg <> " " <> show x
-- printDebug _ _ = pure ()