]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Flow.hs
[Graph][WIP] Cooc 2 graph and missing file.
[gargantext.git] / src / Gargantext / Text / Flow.hs
1 {-|
2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17
18 module Gargantext.Text.Flow
19 where
20
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
25
26 import Data.Map.Strict (Map)
27 import Data.Maybe (catMaybes)
28 import qualified Data.Set as DS
29 import Data.Text (Text)
30
31 import qualified Data.Array.Accelerate as A
32 import qualified Data.Map.Strict as M
33 ----------------------------------------------
34 import Database.PostgreSQL.Simple (Connection)
35
36 import Gargantext.Database.Schema.Node
37 import Gargantext.Database.Types.Node
38
39 import Gargantext.Core (Lang)
40 import Gargantext.Prelude
41
42 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
43 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
44 import Gargantext.Viz.Graph (Graph(..), data2graph)
45 import Gargantext.Text.Metrics.Count (cooc)
46 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
47 import Gargantext.Text.Terms (TermType, extractTerms)
48 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
49 import Gargantext.Core.Types (CorpusId)
50
51 import Gargantext.Text.Parsers.CSV
52
53 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
54
55 {-
56 ____ _ _
57 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
58 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
59 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
60 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
61 |___/
62 -}
63
64
65 contextText :: [T.Text]
66 contextText = ["The dog is an animal."
67 ,"The bird is an animal."
68 ,"The dog is an animal."
69 ,"The animal is a bird or a dog ?"
70 ,"The table is an object."
71 ,"The pen is an object."
72 ,"The object is a pen or a table ?"
73 ,"The girl is a human."
74 ,"The boy is a human."
75 ,"The boy or the girl are human."
76 ]
77
78
79 -- | Control the flow of text
80 data TextFlow = CSV FilePath
81 | FullText FilePath
82 | Contexts [T.Text]
83 | DBV3 Connection CorpusId
84 | Query T.Text
85
86
87 textFlow :: TermType Lang -> TextFlow -> IO Graph
88 textFlow termType workType = do
89 contexts <- case workType of
90 FullText path -> splitBy (Sentences 5) <$> readFile path
91 CSV path -> readCsvOn [csv_title, csv_abstract] path
92 Contexts ctxt -> pure ctxt
93 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
94 _ -> undefined -- TODO Query not supported
95
96 textFlow' termType contexts
97
98
99 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
100 textFlow' termType contexts = do
101 -- Context :: Text -> [Text]
102 -- Contexts = Paragraphs n | Sentences n | Chars n
103
104 myterms <- extractTerms termType contexts
105 -- TermsType = Mono | Multi | MonoMulti
106 -- myterms # filter (\t -> not . elem t stopList)
107 -- # groupBy (Stem|GroupList|Ontology)
108 printDebug "terms" myterms
109 printDebug "myterms" (sum $ map length myterms)
110
111 -- Bulding the map list
112 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
113 -- Cooc = Map (Term, Term) Int
114 let myCooc1 = cooc myterms
115 printDebug "myCooc1 size" (M.size myCooc1)
116
117 -- Remove Apax: appears one time only => lighting the matrix
118 let myCooc2 = M.filter (>0) myCooc1
119 printDebug "myCooc2 size" (M.size myCooc2)
120 printDebug "myCooc2" myCooc2
121 g <- cooc2graph myCooc2
122 pure g
123
124 -- TODO use Text only here instead of [Text]
125 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
126 cooc2graph myCooc = do
127
128 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
129 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
130 (InclusionSize 500 )
131 (SampleBins 10 )
132 (Clusters 3 )
133 (DefaultValue 0 )
134 ) myCooc
135 printDebug "myCooc3 size" $ M.size myCooc3
136 printDebug "myCooc3" myCooc3
137
138 -- Cooc -> Matrix
139 let (ti, _) = createIndices myCooc3
140 printDebug "ti size" $ M.size ti
141 printDebug "ti" ti
142
143 let myCooc4 = toIndex ti myCooc3
144 printDebug "myCooc4 size" $ M.size myCooc4
145 printDebug "myCooc4" myCooc4
146
147 let matCooc = map2mat (0) (M.size ti) myCooc4
148 printDebug "matCooc shape" $ A.arrayShape matCooc
149 printDebug "matCooc" matCooc
150
151 -- Matrix -> Clustering
152 let distanceMat = measureConditional matCooc
153 --let distanceMat = distributional matCooc
154 printDebug "distanceMat shape" $ A.arrayShape distanceMat
155 printDebug "distanceMat" distanceMat
156
157 --let distanceMap = M.filter (>0) $ mat2map distanceMat
158 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
159 printDebug "distanceMap size" $ M.size distanceMap
160 printDebug "distanceMap" distanceMap
161
162 -- let distance = fromIndex fi distanceMap
163 -- printDebug "distance" $ M.size distance
164
165 partitions <- cLouvain distanceMap
166 -- Building : -> Graph -> JSON
167 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
168 --printDebug "partitions" partitions
169 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
170