2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to viz, all the flow of texts in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
18 module Gargantext.Text.Flow
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
26 import Data.Maybe (catMaybes)
27 import qualified Data.Set as DS
29 import qualified Data.Array.Accelerate as A
30 import qualified Data.Map.Strict as M
31 ----------------------------------------------
32 import Database.PostgreSQL.Simple (Connection)
34 import Gargantext.Database.Schema.Node
35 import Gargantext.Database.Types.Node
37 import Gargantext.Core (Lang)
38 import Gargantext.Prelude
40 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
41 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
42 import Gargantext.Viz.Graph (Graph(..), data2graph)
43 import Gargantext.Text.Metrics.Count (cooc)
44 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
45 import Gargantext.Text.Terms (TermType, extractTerms)
46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
47 import Gargantext.Core.Types (CorpusId)
49 import Gargantext.Text.Parsers.CSV
51 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
55 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
56 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
57 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
58 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
63 contextText :: [T.Text]
64 contextText = ["The dog is an animal."
65 ,"The bird is an animal."
66 ,"The dog is an animal."
67 ,"The animal is a bird or a dog ?"
68 ,"The table is an object."
69 ,"The pen is an object."
70 ,"The object is a pen or a table ?"
71 ,"The girl is a human."
72 ,"The boy is a human."
73 ,"The boy or the girl are human."
77 -- | Control the flow of text
78 data TextFlow = CSV FilePath
81 | DBV3 Connection CorpusId
85 textFlow :: TermType Lang -> TextFlow -> IO Graph
86 textFlow termType workType = do
87 contexts <- case workType of
88 FullText path -> splitBy (Sentences 5) <$> readFile path
89 CSV path -> readCsvOn [csv_title, csv_abstract] path
90 Contexts ctxt -> pure ctxt
91 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
92 _ -> undefined -- TODO Query not supported
94 textFlow' termType contexts
97 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
98 textFlow' termType contexts = do
99 -- Context :: Text -> [Text]
100 -- Contexts = Paragraphs n | Sentences n | Chars n
102 myterms <- extractTerms termType contexts
103 -- TermsType = Mono | Multi | MonoMulti
104 -- myterms # filter (\t -> not . elem t stopList)
105 -- # groupBy (Stem|GroupList|Ontology)
106 printDebug "terms" myterms
107 printDebug "myterms" (sum $ map length myterms)
109 -- Bulding the map list
110 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
111 -- Cooc = Map (Term, Term) Int
112 let myCooc1 = cooc myterms
113 printDebug "myCooc1 size" (M.size myCooc1)
115 -- Remove Apax: appears one time only => lighting the matrix
116 let myCooc2 = M.filter (>0) myCooc1
117 printDebug "myCooc2 size" (M.size myCooc2)
118 printDebug "myCooc2" myCooc2
121 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
122 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
128 printDebug "myCooc3 size" $ M.size myCooc3
129 printDebug "myCooc3" myCooc3
132 let (ti, _) = createIndices myCooc3
133 printDebug "ti size" $ M.size ti
136 let myCooc4 = toIndex ti myCooc3
137 printDebug "myCooc4 size" $ M.size myCooc4
138 printDebug "myCooc4" myCooc4
140 let matCooc = map2mat (0) (M.size ti) myCooc4
141 printDebug "matCooc shape" $ A.arrayShape matCooc
142 printDebug "matCooc" matCooc
144 -- Matrix -> Clustering
145 let distanceMat = measureConditional matCooc
146 --let distanceMat = distributional matCooc
147 printDebug "distanceMat shape" $ A.arrayShape distanceMat
148 printDebug "distanceMat" distanceMat
150 --let distanceMap = M.filter (>0) $ mat2map distanceMat
151 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
152 printDebug "distanceMap size" $ M.size distanceMap
153 printDebug "distanceMap" distanceMap
155 -- let distance = fromIndex fi distanceMap
156 -- printDebug "distance" $ M.size distance
158 partitions <- cLouvain distanceMap
159 -- Building : -> Graph -> JSON
160 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
161 --printDebug "partitions" partitions
162 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions