]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Pipeline.hs
[WORKFLOW] clean, issue in map2mat: diagonal == 0.
[gargantext.git] / src / Gargantext / Pipeline.hs
1 {-|
2 Module : Gargantext.Pipeline
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13
14 module Gargantext.Pipeline
15 where
16
17 import qualified Data.Text as T
18 import Data.Text.IO (readFile)
19
20 import Control.Arrow ((***))
21 import Data.Map.Strict (Map)
22 import qualified Data.Array.Accelerate as A
23 import qualified Data.Map.Strict as M
24 import qualified Data.List as L
25 import Data.Tuple.Extra (both)
26 ----------------------------------------------
27 import Gargantext.Core (Lang(FR))
28 import Gargantext.Core.Types (Label)
29 import Gargantext.Prelude
30 import Prelude (print, seq)
31
32 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
33 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
34 import Gargantext.Viz.Graph.Index (Index)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
40
41 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
42
43
44 {-
45 ____ _ _
46 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
47 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
48 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
49 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
50 |___/
51
52 -}
53
54 workflow lang path = do
55 -- Text <- IO Text <- FilePath
56 text <- readFile path
57
58 -- context :: Text -> [Text]
59 let contexts = splitBy (Sentences 5) text
60
61 myterms <- extractTerms Mono lang contexts
62 -- myterms <- extractTerms (Mono lang) contexts # filter (\t -> not . elem t stopList)
63 -- # groupBy (Stem|GroupList)
64 printDebug "myterms" (sum $ map length myterms)
65
66 -- Bulding the map list
67 let myCooc1 = cooc myterms
68 printDebug "myCooc1" (M.size myCooc1)
69
70 -- Remove Apax: appears one time only => lighting the matrix
71 let myCooc2 = M.filter (>1) myCooc1
72 printDebug "myCooc2" (M.size myCooc2)
73
74 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
75 let myCooc3 = filterCooc ( FilterConfig (MapListSize 20 )
76 (InclusionSize 1000 )
77 (SampleBins 10 )
78 (Clusters 3 )
79 (DefaultValue (-1))
80 ) myCooc2
81 printDebug "myCooc3" $ M.size myCooc3
82
83 -- Cooc -> Matrix
84 let (ti, fi) = createIndices myCooc3
85 printDebug "ti" $ M.size ti
86
87 let myCooc4 = toIndex ti myCooc3
88 printDebug "myCooc4" $ M.size myCooc4
89
90 let matCooc = map2mat (-2) (M.size ti) myCooc4
91 printDebug "matCooc" matCooc
92 pure matCooc
93 -- Matrix -> Clustering
94 --let distanceMat = conditional matCooc
95 -- let distanceMat = distributional matCooc
96 -- printDebug "distanceMat" $ A.arrayShape distanceMat
97 -- printDebug "distanceMat" distanceMat
98 --
99 -- let distanceMap = mat2map distanceMat
100 -- printDebug "distanceMap" $ M.size distanceMap
101 --{-
102 -- let distance = fromIndex fi distanceMap
103 -- printDebug "distance" $ M.size distance
104 ---}
105 -- partitions <- cLouvain distanceMap
106 ------ | Building : -> Graph -> JSON
107 -- printDebug "partitions" $ length partitions
108 -- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
109
110
111
112 -----------------------------------------------------------
113 -- distance should not be a map since we just "toList" it (same as cLouvain)
114 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
115 -> Map (Int, Int) Double
116 -> [LouvainNode]
117 -> Graph
118 data2graph labels coocs distance partitions = Graph nodes edges
119 where
120 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
121 nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
122 , n_type = Terms -- or Unknown
123 , n_id = cs (show n)
124 , n_label = T.unwords l
125 , n_attributes =
126 Attributes { clust_default = maybe 0 identity
127 (M.lookup n community_id_by_node_id) } }
128 | (l, n) <- labels ]
129 edges = [ Edge { e_source = s
130 , e_target = t
131 , e_weight = w
132 , e_id = i }
133 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
134 -----------------------------------------------------------
135
136 printDebug msg x = putStrLn $ msg <> " " <> show x
137 --printDebug _ _ = pure ()
138
139
140
141