2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
18 module Gargantext.Text.Flow
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
26 import Data.Map.Strict (Map)
27 import Data.Maybe (catMaybes)
28 --import qualified Data.Set as DS
29 import Data.Text (Text)
31 --import qualified Data.Array.Accelerate as A
32 import qualified Data.Map.Strict as M
33 ----------------------------------------------
34 import Database.PostgreSQL.Simple (Connection)
36 import Gargantext.Database.Schema.Node
37 import Gargantext.Database.Types.Node
39 import Gargantext.Core (Lang)
40 import Gargantext.Prelude
42 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
43 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
44 import Gargantext.Viz.Graph (Graph(..), data2graph)
45 import Gargantext.Text.Metrics.Count (cooc)
46 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
47 import Gargantext.Text.Terms (TermType, extractTerms)
48 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
49 import Gargantext.Core.Types (CorpusId)
51 import Gargantext.Text.Parsers.CSV
53 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
57 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
58 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
59 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
60 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
65 contextText :: [T.Text]
66 contextText = ["The dog is an animal."
67 ,"The bird is an animal."
68 ,"The dog is an animal."
69 ,"The animal is a bird or a dog ?"
70 ,"The table is an object."
71 ,"The pen is an object."
72 ,"The object is a pen or a table ?"
73 ,"The girl is a human."
74 ,"The boy is a human."
75 ,"The boy or the girl are human."
79 -- | Control the flow of text
80 data TextFlow = CSV FilePath
83 | DBV3 Connection CorpusId
87 textFlow :: TermType Lang -> TextFlow -> IO Graph
88 textFlow termType workType = do
89 contexts <- case workType of
90 FullText path -> splitBy (Sentences 5) <$> readFile path
91 CSV path -> readCsvOn [csv_title, csv_abstract] path
92 Contexts ctxt -> pure ctxt
93 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
94 _ -> undefined -- TODO Query not supported
96 textFlow' termType contexts
99 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
100 textFlow' termType contexts = do
101 -- Context :: Text -> [Text]
102 -- Contexts = Paragraphs n | Sentences n | Chars n
104 myterms <- extractTerms termType contexts
105 -- TermsType = Mono | Multi | MonoMulti
106 -- myterms # filter (\t -> not . elem t stopList)
107 -- # groupBy (Stem|GroupList|Ontology)
108 --printDebug "terms" myterms
109 --printDebug "myterms" (sum $ map length myterms)
111 -- Bulding the map list
112 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
113 -- Cooc = Map (Term, Term) Int
114 let myCooc1 = cooc myterms
115 --printDebug "myCooc1 size" (M.size myCooc1)
117 -- Remove Apax: appears one time only => lighting the matrix
118 let myCooc2 = M.filter (>0) myCooc1
119 --printDebug "myCooc2 size" (M.size myCooc2)
120 --printDebug "myCooc2" myCooc2
121 g <- cooc2graph myCooc2
124 -- TODO use Text only here instead of [Text]
125 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
126 cooc2graph myCooc = do
127 --printDebug "myCooc" myCooc
128 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
129 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
135 --printDebug "myCooc3 size" $ M.size myCooc3
136 --printDebug "myCooc3" myCooc3
139 let (ti, _) = createIndices myCooc3
140 --printDebug "ti size" $ M.size ti
143 let myCooc4 = toIndex ti myCooc3
144 --printDebug "myCooc4 size" $ M.size myCooc4
145 --printDebug "myCooc4" myCooc4
147 let matCooc = map2mat (0) (M.size ti) myCooc4
148 --printDebug "matCooc shape" $ A.arrayShape matCooc
149 --printDebug "matCooc" matCooc
151 -- Matrix -> Clustering
152 let distanceMat = measureConditional matCooc
153 --let distanceMat = distributional matCooc
154 --printDebug "distanceMat shape" $ A.arrayShape distanceMat
155 --printDebug "distanceMat" distanceMat
157 --let distanceMap = M.filter (>0) $ mat2map distanceMat
158 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
159 --printDebug "distanceMap size" $ M.size distanceMap
160 --printDebug "distanceMap" distanceMap
162 -- let distance = fromIndex fi distanceMap
163 --printDebug "distance" $ M.size distance
165 partitions <- case M.size distanceMap > 0 of
166 True -> cLouvain distanceMap
167 False -> panic "Text.Flow: DistanceMap is empty"
168 -- Building : -> Graph -> JSON
169 --printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
170 --printDebug "partitions" partitions
171 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions