]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Flow.hs
[Phylo] Example Phylo' type.
[gargantext.git] / src / Gargantext / Text / Flow.hs
1 {-|
2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17
18 module Gargantext.Text.Flow
19 where
20
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
25
26 import Data.Map.Strict (Map)
27 import Data.Maybe (catMaybes)
28 --import qualified Data.Set as DS
29 import Data.Text (Text)
30
31 --import qualified Data.Array.Accelerate as A
32 import qualified Data.Map.Strict as M
33 ----------------------------------------------
34 import Database.PostgreSQL.Simple (Connection)
35
36 import Gargantext.Database.Schema.Node
37 import Gargantext.Database.Types.Node
38
39 import Gargantext.Core (Lang)
40 import Gargantext.Prelude
41
42 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
43 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
44 import Gargantext.Viz.Graph (Graph(..), data2graph)
45 import Gargantext.Text.Metrics.Count (cooc)
46 import Gargantext.Viz.Graph.Bridgeness (bridgeness)
47 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
48 import Gargantext.Text.Terms (TermType, extractTerms)
49 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
50 import Gargantext.Core.Types (CorpusId)
51
52 import Gargantext.Text.Parsers.CSV
53
54 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
55
56 {-
57 ____ _ _
58 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
59 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
60 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
61 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
62 |___/
63 -}
64
65
66 contextText :: [T.Text]
67 contextText = ["The dog is an animal."
68 ,"The bird is an animal."
69 ,"The dog is an animal."
70 ,"The animal is a bird or a dog ?"
71 ,"The table is an object."
72 ,"The pen is an object."
73 ,"The object is a pen or a table ?"
74 ,"The girl is a human."
75 ,"The boy is a human."
76 ,"The boy or the girl are human."
77 ]
78
79
80 -- | Control the flow of text
81 data TextFlow = CSV FilePath
82 | FullText FilePath
83 | Contexts [T.Text]
84 | DBV3 Connection CorpusId
85 | Query T.Text
86
87
88 textFlow :: TermType Lang -> TextFlow -> IO Graph
89 textFlow termType workType = do
90 contexts <- case workType of
91 FullText path -> splitBy (Sentences 5) <$> readFile path
92 CSV path -> readCsvOn [csv_title, csv_abstract] path
93 Contexts ctxt -> pure ctxt
94 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
95 _ -> undefined -- TODO Query not supported
96
97 textFlow' termType contexts
98
99
100 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
101 textFlow' termType contexts = do
102 -- Context :: Text -> [Text]
103 -- Contexts = Paragraphs n | Sentences n | Chars n
104
105 myterms <- extractTerms termType contexts
106 -- TermsType = Mono | Multi | MonoMulti
107 -- myterms # filter (\t -> not . elem t stopList)
108 -- # groupBy (Stem|GroupList|Ontology)
109 --printDebug "terms" myterms
110 --printDebug "myterms" (sum $ map length myterms)
111
112 -- Bulding the map list
113 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
114 -- Cooc = Map (Term, Term) Int
115 let myCooc1 = cooc myterms
116 --printDebug "myCooc1 size" (M.size myCooc1)
117
118 -- Remove Apax: appears one time only => lighting the matrix
119 let myCooc2 = M.filter (>0) myCooc1
120 --printDebug "myCooc2 size" (M.size myCooc2)
121 --printDebug "myCooc2" myCooc2
122 g <- cooc2graph myCooc2
123 pure g
124
125 -- TODO use Text only here instead of [Text]
126 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
127 cooc2graph myCooc = do
128 --printDebug "myCooc" myCooc
129 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
130 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
131 (InclusionSize 500 )
132 (SampleBins 10 )
133 (Clusters 3 )
134 (DefaultValue 0 )
135 ) myCooc
136 --printDebug "myCooc3 size" $ M.size myCooc3
137 --printDebug "myCooc3" myCooc3
138
139 -- Cooc -> Matrix
140 let (ti, _) = createIndices myCooc3
141 --printDebug "ti size" $ M.size ti
142 --printDebug "ti" ti
143
144 let myCooc4 = toIndex ti myCooc3
145 --printDebug "myCooc4 size" $ M.size myCooc4
146 --printDebug "myCooc4" myCooc4
147
148 let matCooc = map2mat (0) (M.size ti) myCooc4
149 --printDebug "matCooc shape" $ A.arrayShape matCooc
150 --printDebug "matCooc" matCooc
151
152 -- Matrix -> Clustering
153 let distanceMat = measureConditional matCooc
154 --let distanceMat = distributional matCooc
155 --printDebug "distanceMat shape" $ A.arrayShape distanceMat
156 --printDebug "distanceMat" distanceMat
157
158 --let distanceMap = M.filter (>0) $ mat2map distanceMat
159 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
160 --printDebug "distanceMap size" $ M.size distanceMap
161 --printDebug "distanceMap" distanceMap
162
163 -- let distance = fromIndex fi distanceMap
164 --printDebug "distance" $ M.size distance
165
166 partitions <- case M.size distanceMap > 0 of
167 True -> cLouvain distanceMap
168 False -> panic "Text.Flow: DistanceMap is empty"
169 -- Building : -> Graph -> JSON
170 --printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
171 --printDebug "partitions" partitions
172 let distanceMap' = bridgeness 5 partitions distanceMap
173 pure $ data2graph (M.toList ti) myCooc4 distanceMap' partitions
174