]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Pipeline.hs
Merge branch 'pipeline'
[gargantext.git] / src / Gargantext / Pipeline.hs
1 {-|
2 Module : Gargantext.Pipeline
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13
14 module Gargantext.Pipeline
15 where
16
17 import qualified Data.Text as T
18 import Data.Text.IO (readFile)
19
20 import Control.Arrow ((***))
21 import Data.Map.Strict (Map)
22 import qualified Data.Array.Accelerate as A
23 import qualified Data.Map.Strict as M
24 import qualified Data.List as L
25 import Data.Tuple.Extra (both)
26 ----------------------------------------------
27 import Gargantext.Core (Lang(FR))
28 import Gargantext.Core.Types (Label)
29 import Gargantext.Prelude
30 import Prelude (print, seq)
31
32 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
33 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
34 import Gargantext.Viz.Graph.Index (Index)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
40
41 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
42
43
44 {-
45 ____ _ _
46 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
47 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
48 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
49 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
50 |___/
51
52 -}
53
54 workflow lang path = do
55 -- Text <- IO Text <- FilePath
56 text <- readFile path
57
58 let contexts = splitBy (Sentences 5) text
59 -- Context :: Text -> [Text]
60 -- Contexts = Paragraphs n | Sentences n | Chars n
61
62 myterms <- extractTerms (Mono lang) contexts
63 -- myterms # filter (\t -> not . elem t stopList)
64 -- # groupBy (Stem|GroupList)
65 printDebug "myterms" (sum $ map length myterms)
66
67 -- Bulding the map list
68 -- compute copresences of terms
69 -- Cooc = Map (Term, Term) Int
70 let myCooc1 = cooc myterms
71 printDebug "myCooc1" (M.size myCooc1)
72
73 -- Remove Apax: appears one time only => lighting the matrix
74 let myCooc2 = M.filter (>1) myCooc1
75 printDebug "myCooc2" (M.size myCooc2)
76
77 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
78 let myCooc3 = filterCooc ( FilterConfig (MapListSize 20 )
79 (InclusionSize 1000 )
80 (SampleBins 10 )
81 (Clusters 3 )
82 (DefaultValue (-1))
83 ) myCooc2
84 printDebug "myCooc3" $ M.size myCooc3
85
86 -- Cooc -> Matrix
87 let (ti, fi) = createIndices myCooc3
88 printDebug "ti" $ M.size ti
89
90 let myCooc4 = toIndex ti myCooc3
91 printDebug "myCooc4" $ M.size myCooc4
92
93 let matCooc = map2mat (-2) (M.size ti) myCooc4
94 printDebug "matCooc" matCooc
95 pure matCooc
96 -- Matrix -> Clustering
97 --let distanceMat = conditional matCooc
98 -- let distanceMat = distributional matCooc
99 -- printDebug "distanceMat" $ A.arrayShape distanceMat
100 -- printDebug "distanceMat" distanceMat
101 --
102 -- let distanceMap = mat2map distanceMat
103 -- printDebug "distanceMap" $ M.size distanceMap
104 --{-
105 -- let distance = fromIndex fi distanceMap
106 -- printDebug "distance" $ M.size distance
107 ---}
108 -- partitions <- cLouvain distanceMap
109 ------ | Building : -> Graph -> JSON
110 -- printDebug "partitions" $ length partitions
111 -- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
112
113
114
115 -----------------------------------------------------------
116 -- distance should not be a map since we just "toList" it (same as cLouvain)
117 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
118 -> Map (Int, Int) Double
119 -> [LouvainNode]
120 -> Graph
121 data2graph labels coocs distance partitions = Graph nodes edges
122 where
123 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
124 nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
125 , n_type = Terms -- or Unknown
126 , n_id = cs (show n)
127 , n_label = T.unwords l
128 , n_attributes =
129 Attributes { clust_default = maybe 0 identity
130 (M.lookup n community_id_by_node_id) } }
131 | (l, n) <- labels ]
132 edges = [ Edge { e_source = s
133 , e_target = t
134 , e_weight = w
135 , e_id = i }
136 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
137 -----------------------------------------------------------
138
139 printDebug msg x = putStrLn $ msg <> " " <> show x
140 --printDebug _ _ = pure ()
141
142
143
144