2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
18 module Gargantext.Text.Flow
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
26 import Data.Map.Strict (Map)
27 import Data.Maybe (catMaybes)
28 import qualified Data.Set as DS
29 import Data.Text (Text)
31 import qualified Data.Array.Accelerate as A
32 import qualified Data.Map.Strict as M
33 ----------------------------------------------
34 import Database.PostgreSQL.Simple (Connection)
36 import Gargantext.Database.Schema.Node
37 import Gargantext.Database.Types.Node
39 import Gargantext.Core (Lang)
40 import Gargantext.Prelude
42 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
43 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
44 import Gargantext.Viz.Graph (Graph(..), data2graph)
45 import Gargantext.Text.Metrics.Count (cooc)
46 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
47 import Gargantext.Text.Terms (TermType, extractTerms)
48 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
49 import Gargantext.Core.Types (CorpusId)
51 import Gargantext.Text.Parsers.CSV
53 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
57 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
58 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
59 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
60 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
65 contextText :: [T.Text]
66 contextText = ["The dog is an animal."
67 ,"The bird is an animal."
68 ,"The dog is an animal."
69 ,"The animal is a bird or a dog ?"
70 ,"The table is an object."
71 ,"The pen is an object."
72 ,"The object is a pen or a table ?"
73 ,"The girl is a human."
74 ,"The boy is a human."
75 ,"The boy or the girl are human."
79 -- | Control the flow of text
80 data TextFlow = CSV FilePath
83 | DBV3 Connection CorpusId
87 textFlow :: TermType Lang -> TextFlow -> IO Graph
88 textFlow termType workType = do
89 contexts <- case workType of
90 FullText path -> splitBy (Sentences 5) <$> readFile path
91 CSV path -> readCsvOn [csv_title, csv_abstract] path
92 Contexts ctxt -> pure ctxt
93 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
94 _ -> undefined -- TODO Query not supported
96 textFlow' termType contexts
99 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
100 textFlow' termType contexts = do
101 -- Context :: Text -> [Text]
102 -- Contexts = Paragraphs n | Sentences n | Chars n
104 myterms <- extractTerms termType contexts
105 -- TermsType = Mono | Multi | MonoMulti
106 -- myterms # filter (\t -> not . elem t stopList)
107 -- # groupBy (Stem|GroupList|Ontology)
108 printDebug "terms" myterms
109 printDebug "myterms" (sum $ map length myterms)
111 -- Bulding the map list
112 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
113 -- Cooc = Map (Term, Term) Int
114 let myCooc1 = cooc myterms
115 printDebug "myCooc1 size" (M.size myCooc1)
117 -- Remove Apax: appears one time only => lighting the matrix
118 let myCooc2 = M.filter (>0) myCooc1
119 printDebug "myCooc2 size" (M.size myCooc2)
120 printDebug "myCooc2" myCooc2
121 g <- cooc2graph myCooc2
124 -- TODO use Text only here instead of [Text]
125 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
126 cooc2graph myCooc = do
128 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
129 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
135 printDebug "myCooc3 size" $ M.size myCooc3
136 printDebug "myCooc3" myCooc3
139 let (ti, _) = createIndices myCooc3
140 printDebug "ti size" $ M.size ti
143 let myCooc4 = toIndex ti myCooc3
144 printDebug "myCooc4 size" $ M.size myCooc4
145 printDebug "myCooc4" myCooc4
147 let matCooc = map2mat (0) (M.size ti) myCooc4
148 printDebug "matCooc shape" $ A.arrayShape matCooc
149 printDebug "matCooc" matCooc
151 -- Matrix -> Clustering
152 let distanceMat = measureConditional matCooc
153 --let distanceMat = distributional matCooc
154 printDebug "distanceMat shape" $ A.arrayShape distanceMat
155 printDebug "distanceMat" distanceMat
157 --let distanceMap = M.filter (>0) $ mat2map distanceMat
158 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
159 printDebug "distanceMap size" $ M.size distanceMap
160 printDebug "distanceMap" distanceMap
162 -- let distance = fromIndex fi distanceMap
163 -- printDebug "distance" $ M.size distance
165 partitions <- cLouvain distanceMap
166 -- Building : -> Graph -> JSON
167 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
168 --printDebug "partitions" partitions
169 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions