]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[BASQHL] first basic function and todo list.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import qualified Data.Text as T
21 import Data.Text.IO (readFile)
22
23 import Control.Arrow ((***))
24 import Data.Map.Strict (Map)
25 import qualified Data.Array.Accelerate as A
26 import qualified Data.Map.Strict as M
27 import qualified Data.List as L
28 import Data.Tuple.Extra (both)
29 ----------------------------------------------
30 import Gargantext.Core (Lang(FR))
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
33 import Prelude (print, seq)
34
35 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
36 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
37 import Gargantext.Viz.Graph.Index (Index)
38 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
39 import Gargantext.Text.Metrics.Count (cooc)
40 import Gargantext.Text.Metrics
41 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
42 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
43
44 import Gargantext.Text.Parsers.CSV
45
46 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
47
48
49 {-
50 ____ _ _
51 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
52 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
53 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
54 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
55 |___/
56
57 -}
58
59 data TextFlow = CSV | FullText
60
61 -- workflow :: Lang (EN|FR) -> FilePath -> Graph
62 textflow termsLang workType path = do
63 -- Text <- IO Text <- FilePath
64 contexts <- case workType of
65 FullText -> splitBy (Sentences 5) <$> readFile path
66 CSV -> readCsvOn [csv_title, csv_abstract] path
67
68 -- Context :: Text -> [Text]
69 -- Contexts = Paragraphs n | Sentences n | Chars n
70
71 myterms <- extractTerms (Mono FR) contexts
72 -- TermsType = Mono | Multi | MonoMulti
73 -- myterms # filter (\t -> not . elem t stopList)
74 -- # groupBy (Stem|GroupList|Ontology)
75 printDebug "myterms" (sum $ map length myterms)
76
77 -- Bulding the map list
78 -- compute copresences of terms
79 -- Cooc = Map (Term, Term) Int
80 let myCooc1 = cooc myterms
81 printDebug "myCooc1" (M.size myCooc1)
82
83 -- Remove Apax: appears one time only => lighting the matrix
84 let myCooc2 = M.filter (>1) myCooc1
85 printDebug "myCooc2" (M.size myCooc2)
86
87 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
88 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
89 (InclusionSize 4000 )
90 (SampleBins 10 )
91 (Clusters 3 )
92 (DefaultValue 0 )
93 ) myCooc2
94 printDebug "myCooc3" $ M.size myCooc3
95
96 -- Cooc -> Matrix
97 let (ti, fi) = createIndices myCooc3
98 printDebug "ti" $ M.size ti
99
100 let myCooc4 = toIndex ti myCooc3
101 printDebug "myCooc4" $ M.size myCooc4
102
103 let matCooc = map2mat (0) (M.size ti) myCooc4
104 --printDebug "matCooc" matCooc
105 -- Matrix -> Clustering
106 let distanceMat = conditional matCooc
107 -- let distanceMat = distributional matCooc
108 printDebug "distanceMat" $ A.arrayShape distanceMat
109 --printDebug "distanceMat" distanceMat
110 --
111 let distanceMap = mat2map distanceMat
112 printDebug "distanceMap" $ M.size distanceMap
113 --{-
114 -- let distance = fromIndex fi distanceMap
115 -- printDebug "distance" $ M.size distance
116 ---}
117 partitions <- cLouvain distanceMap
118 ------ | Building : -> Graph -> JSON
119 printDebug "partitions" $ length partitions
120 --printDebug "partitions" partitions
121 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
122
123
124 -----------------------------------------------------------
125 -- distance should not be a map since we just "toList" it (same as cLouvain)
126 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
127 -> Map (Int, Int) Double
128 -> [LouvainNode]
129 -> Graph
130 data2graph labels coocs distance partitions = Graph nodes edges
131 where
132 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
133 nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
134 , n_type = Terms -- or Unknown
135 , n_id = cs (show n)
136 , n_label = T.unwords l
137 , n_attributes =
138 Attributes { clust_default = maybe 0 identity
139 (M.lookup n community_id_by_node_id) } }
140 | (l, n) <- labels ]
141 edges = [ Edge { e_source = s
142 , e_target = t
143 , e_weight = w
144 , e_id = i }
145 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
146 -----------------------------------------------------------
147
148 printDebug msg x = putStrLn $ msg <> " " <> show x
149 --printDebug _ _ = pure ()
150
151