2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
18 module Gargantext.Text.Flow
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
26 import Data.Map.Strict (Map)
27 import Data.Maybe (catMaybes)
28 import qualified Data.Set as DS
29 import Data.Text (Text)
31 import qualified Data.Array.Accelerate as A
32 import qualified Data.Map.Strict as M
33 ----------------------------------------------
34 import Database.PostgreSQL.Simple (Connection)
36 import Gargantext.Database.Schema.Node
37 import Gargantext.Database.Types.Node
38 import Gargantext.Database.Utils (Cmd, mkCmd)
40 import Gargantext.Core (Lang)
41 import Gargantext.Prelude
43 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
44 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
45 import Gargantext.Viz.Graph (Graph(..), data2graph)
46 import Gargantext.Text.Metrics.Count (cooc)
47 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
48 import Gargantext.Text.Terms (TermType, extractTerms)
49 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
50 import Gargantext.Core.Types (CorpusId)
52 import Gargantext.Text.Parsers.CSV
54 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
58 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
59 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
60 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
61 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
66 contextText :: [T.Text]
67 contextText = ["The dog is an animal."
68 ,"The bird is an animal."
69 ,"The dog is an animal."
70 ,"The animal is a bird or a dog ?"
71 ,"The table is an object."
72 ,"The pen is an object."
73 ,"The object is a pen or a table ?"
74 ,"The girl is a human."
75 ,"The boy is a human."
76 ,"The boy or the girl are human."
80 -- | Control the flow of text
81 data TextFlow = CSV FilePath
84 | DBV3 Connection CorpusId
88 textFlow :: TermType Lang -> TextFlow -> IO Graph
89 textFlow termType workType = do
90 contexts <- case workType of
91 FullText path -> splitBy (Sentences 5) <$> readFile path
92 CSV path -> readCsvOn [csv_title, csv_abstract] path
93 Contexts ctxt -> pure ctxt
94 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
95 _ -> undefined -- TODO Query not supported
97 textFlow' termType contexts
100 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
101 textFlow' termType contexts = do
102 -- Context :: Text -> [Text]
103 -- Contexts = Paragraphs n | Sentences n | Chars n
105 myterms <- extractTerms termType contexts
106 -- TermsType = Mono | Multi | MonoMulti
107 -- myterms # filter (\t -> not . elem t stopList)
108 -- # groupBy (Stem|GroupList|Ontology)
109 printDebug "terms" myterms
110 printDebug "myterms" (sum $ map length myterms)
112 -- Bulding the map list
113 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
114 -- Cooc = Map (Term, Term) Int
115 let myCooc1 = cooc myterms
116 printDebug "myCooc1 size" (M.size myCooc1)
118 -- Remove Apax: appears one time only => lighting the matrix
119 let myCooc2 = M.filter (>0) myCooc1
120 printDebug "myCooc2 size" (M.size myCooc2)
121 printDebug "myCooc2" myCooc2
122 g <- cooc2graph myCooc2
125 -- TODO use Text only here instead of [Text]
126 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
127 cooc2graph myCooc = do
129 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
130 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
136 printDebug "myCooc3 size" $ M.size myCooc3
137 printDebug "myCooc3" myCooc3
140 let (ti, _) = createIndices myCooc3
141 printDebug "ti size" $ M.size ti
144 let myCooc4 = toIndex ti myCooc3
145 printDebug "myCooc4 size" $ M.size myCooc4
146 printDebug "myCooc4" myCooc4
148 let matCooc = map2mat (0) (M.size ti) myCooc4
149 printDebug "matCooc shape" $ A.arrayShape matCooc
150 printDebug "matCooc" matCooc
152 -- Matrix -> Clustering
153 let distanceMat = measureConditional matCooc
154 --let distanceMat = distributional matCooc
155 printDebug "distanceMat shape" $ A.arrayShape distanceMat
156 printDebug "distanceMat" distanceMat
158 --let distanceMap = M.filter (>0) $ mat2map distanceMat
159 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
160 printDebug "distanceMap size" $ M.size distanceMap
161 printDebug "distanceMap" distanceMap
163 -- let distance = fromIndex fi distanceMap
164 -- printDebug "distance" $ M.size distance
166 partitions <- cLouvain distanceMap
167 -- Building : -> Graph -> JSON
168 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
169 --printDebug "partitions" partitions
170 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions