]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Flow.hs
[FEAT] Adding bridgeness function to filter links between partitions of a graph.
[gargantext.git] / src / Gargantext / Text / Flow.hs
1 {-|
2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17
18 module Gargantext.Text.Flow
19 where
20
21 --import qualified Data.Array.Accelerate as A
22 --import qualified Data.Set as DS
23 import Control.Monad.Reader
24 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
25 import Data.Map.Strict (Map)
26 import Data.Maybe (catMaybes)
27 import qualified Data.Map.Strict as M
28 import qualified Data.Text as T
29 import Data.Text (Text)
30 import Data.Text.IO (readFile)
31 import Database.PostgreSQL.Simple (Connection)
32 import GHC.IO (FilePath)
33 import Gargantext.Core (Lang)
34 import Gargantext.Core.Types (CorpusId)
35 import Gargantext.Database.Schema.Node
36 import Gargantext.Database.Types.Node
37 import Gargantext.Prelude
38 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
39 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
40 import Gargantext.Text.Metrics.Count (cooc)
41 import Gargantext.Text.Parsers.CSV
42 import Gargantext.Text.Terms (TermType, extractTerms)
43 import Gargantext.Viz.Graph (Graph(..), data2graph)
44 import Gargantext.Viz.Graph.Bridgeness (bridgeness)
45 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
46 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
47 {-
48 ____ _ _
49 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
50 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
51 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
52 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
53 |___/
54 -}
55
56
57 contextText :: [T.Text]
58 contextText = ["The dog is an animal."
59 ,"The bird is an animal."
60 ,"The dog is an animal."
61 ,"The animal is a bird or a dog ?"
62 ,"The table is an object."
63 ,"The pen is an object."
64 ,"The object is a pen or a table ?"
65 ,"The girl is a human."
66 ,"The boy is a human."
67 ,"The boy or the girl are human."
68 ]
69
70
71 -- | Control the flow of text
72 data TextFlow = CSV FilePath
73 | FullText FilePath
74 | Contexts [T.Text]
75 | DBV3 Connection CorpusId
76 | Query T.Text
77
78
79 textFlow :: TermType Lang -> TextFlow -> IO Graph
80 textFlow termType workType = do
81 contexts <- case workType of
82 FullText path -> splitBy (Sentences 5) <$> readFile path
83 CSV path -> readCsvOn [csv_title, csv_abstract] path
84 Contexts ctxt -> pure ctxt
85 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
86 _ -> undefined -- TODO Query not supported
87
88 textFlow' termType contexts
89
90
91 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
92 textFlow' termType contexts = do
93 -- Context :: Text -> [Text]
94 -- Contexts = Paragraphs n | Sentences n | Chars n
95
96 myterms <- extractTerms termType contexts
97 -- TermsType = Mono | Multi | MonoMulti
98 -- myterms # filter (\t -> not . elem t stopList)
99 -- # groupBy (Stem|GroupList|Ontology)
100 --printDebug "terms" myterms
101 --printDebug "myterms" (sum $ map length myterms)
102
103 -- Bulding the map list
104 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
105 -- Cooc = Map (Term, Term) Int
106 let myCooc1 = cooc myterms
107 --printDebug "myCooc1 size" (M.size myCooc1)
108
109 -- Remove Apax: appears one time only => lighting the matrix
110 let myCooc2 = M.filter (>0) myCooc1
111 --printDebug "myCooc2 size" (M.size myCooc2)
112 --printDebug "myCooc2" myCooc2
113 g <- cooc2graph myCooc2
114 pure g
115
116 -- TODO use Text only here instead of [Text]
117 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
118 cooc2graph myCooc = do
119 --printDebug "myCooc" myCooc
120 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
121 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
122 (InclusionSize 500 )
123 (SampleBins 10 )
124 (Clusters 3 )
125 (DefaultValue 0 )
126 ) myCooc
127 --printDebug "myCooc3 size" $ M.size myCooc3
128 --printDebug "myCooc3" myCooc3
129
130 -- Cooc -> Matrix
131 let (ti, _) = createIndices myCooc3
132 --printDebug "ti size" $ M.size ti
133 --printDebug "ti" ti
134
135 let myCooc4 = toIndex ti myCooc3
136 --printDebug "myCooc4 size" $ M.size myCooc4
137 --printDebug "myCooc4" myCooc4
138
139 let matCooc = map2mat (0) (M.size ti) myCooc4
140 --printDebug "matCooc shape" $ A.arrayShape matCooc
141 --printDebug "matCooc" matCooc
142
143 -- Matrix -> Clustering
144 let distanceMat = measureConditional matCooc
145 --let distanceMat = distributional matCooc
146 --printDebug "distanceMat shape" $ A.arrayShape distanceMat
147 --printDebug "distanceMat" distanceMat
148
149 --let distanceMap = M.filter (>0) $ mat2map distanceMat
150 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
151 --printDebug "distanceMap size" $ M.size distanceMap
152 --printDebug "distanceMap" distanceMap
153
154 -- let distance = fromIndex fi distanceMap
155 --printDebug "distance" $ M.size distance
156
157 partitions <- case M.size distanceMap > 0 of
158 True -> cLouvain distanceMap
159 False -> panic "Text.Flow: DistanceMap is empty"
160 -- Building : -> Graph -> JSON
161 --printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
162 --printDebug "partitions" partitions
163 let distanceMap' = bridgeness 300 partitions distanceMap
164 pure $ data2graph (M.toList ti) myCooc4 distanceMap' partitions
165