]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Flow.hs
[WIP] Connecting graph.
[gargantext.git] / src / Gargantext / Text / Flow.hs
1 {-|
2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17
18 module Gargantext.Text.Flow
19 where
20
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
25
26 import Data.Map.Strict (Map)
27 import Data.Maybe (catMaybes)
28 import qualified Data.Set as DS
29 import Data.Text (Text)
30
31 import qualified Data.Array.Accelerate as A
32 import qualified Data.Map.Strict as M
33 ----------------------------------------------
34 import Database.PostgreSQL.Simple (Connection)
35
36 import Gargantext.Database.Schema.Node
37 import Gargantext.Database.Types.Node
38 import Gargantext.Database.Utils (Cmd, mkCmd)
39
40 import Gargantext.Core (Lang)
41 import Gargantext.Prelude
42
43 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
44 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
45 import Gargantext.Viz.Graph (Graph(..), data2graph)
46 import Gargantext.Text.Metrics.Count (cooc)
47 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
48 import Gargantext.Text.Terms (TermType, extractTerms)
49 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
50 import Gargantext.Core.Types (CorpusId)
51
52 import Gargantext.Text.Parsers.CSV
53
54 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
55
56 {-
57 ____ _ _
58 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
59 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
60 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
61 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
62 |___/
63 -}
64
65
66 contextText :: [T.Text]
67 contextText = ["The dog is an animal."
68 ,"The bird is an animal."
69 ,"The dog is an animal."
70 ,"The animal is a bird or a dog ?"
71 ,"The table is an object."
72 ,"The pen is an object."
73 ,"The object is a pen or a table ?"
74 ,"The girl is a human."
75 ,"The boy is a human."
76 ,"The boy or the girl are human."
77 ]
78
79
80 -- | Control the flow of text
81 data TextFlow = CSV FilePath
82 | FullText FilePath
83 | Contexts [T.Text]
84 | DBV3 Connection CorpusId
85 | Query T.Text
86
87
88 textFlow :: TermType Lang -> TextFlow -> IO Graph
89 textFlow termType workType = do
90 contexts <- case workType of
91 FullText path -> splitBy (Sentences 5) <$> readFile path
92 CSV path -> readCsvOn [csv_title, csv_abstract] path
93 Contexts ctxt -> pure ctxt
94 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
95 _ -> undefined -- TODO Query not supported
96
97 textFlow' termType contexts
98
99
100 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
101 textFlow' termType contexts = do
102 -- Context :: Text -> [Text]
103 -- Contexts = Paragraphs n | Sentences n | Chars n
104
105 myterms <- extractTerms termType contexts
106 -- TermsType = Mono | Multi | MonoMulti
107 -- myterms # filter (\t -> not . elem t stopList)
108 -- # groupBy (Stem|GroupList|Ontology)
109 printDebug "terms" myterms
110 printDebug "myterms" (sum $ map length myterms)
111
112 -- Bulding the map list
113 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
114 -- Cooc = Map (Term, Term) Int
115 let myCooc1 = cooc myterms
116 printDebug "myCooc1 size" (M.size myCooc1)
117
118 -- Remove Apax: appears one time only => lighting the matrix
119 let myCooc2 = M.filter (>0) myCooc1
120 printDebug "myCooc2 size" (M.size myCooc2)
121 printDebug "myCooc2" myCooc2
122 g <- cooc2graph myCooc2
123 pure g
124
125 -- TODO use Text only here instead of [Text]
126 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
127 cooc2graph myCooc = do
128
129 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
130 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
131 (InclusionSize 500 )
132 (SampleBins 10 )
133 (Clusters 3 )
134 (DefaultValue 0 )
135 ) myCooc
136 printDebug "myCooc3 size" $ M.size myCooc3
137 printDebug "myCooc3" myCooc3
138
139 -- Cooc -> Matrix
140 let (ti, _) = createIndices myCooc3
141 printDebug "ti size" $ M.size ti
142 printDebug "ti" ti
143
144 let myCooc4 = toIndex ti myCooc3
145 printDebug "myCooc4 size" $ M.size myCooc4
146 printDebug "myCooc4" myCooc4
147
148 let matCooc = map2mat (0) (M.size ti) myCooc4
149 printDebug "matCooc shape" $ A.arrayShape matCooc
150 printDebug "matCooc" matCooc
151
152 -- Matrix -> Clustering
153 let distanceMat = measureConditional matCooc
154 --let distanceMat = distributional matCooc
155 printDebug "distanceMat shape" $ A.arrayShape distanceMat
156 printDebug "distanceMat" distanceMat
157
158 --let distanceMap = M.filter (>0) $ mat2map distanceMat
159 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
160 printDebug "distanceMap size" $ M.size distanceMap
161 printDebug "distanceMap" distanceMap
162
163 -- let distance = fromIndex fi distanceMap
164 -- printDebug "distance" $ M.size distance
165
166 partitions <- cLouvain distanceMap
167 -- Building : -> Graph -> JSON
168 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
169 --printDebug "partitions" partitions
170 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
171