]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Flow.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Text / Flow.hs
1 {-|
2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17
18 module Gargantext.Text.Flow
19 where
20
21 --import qualified Data.Array.Accelerate as A
22 --import qualified Data.Set as DS
23 import Control.Monad.Reader
24 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
25 import Data.Map.Strict (Map)
26 import Data.Maybe (catMaybes)
27 import qualified Data.Map.Strict as M
28 import qualified Data.Text as T
29 import Data.Text (Text)
30 import Data.Text.IO (readFile)
31 import Database.PostgreSQL.Simple (Connection)
32 import GHC.IO (FilePath)
33 import Gargantext.Core (Lang)
34 import Gargantext.Core.Types (CorpusId)
35 import Gargantext.Database.Schema.Node
36 import Gargantext.Database.Types.Node
37 import Gargantext.Prelude
38 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
39 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
40 import Gargantext.Text.Metrics.Count (cooc)
41 import Gargantext.Text.Parsers.CSV
42 import Gargantext.Text.Terms (TermType, extractTerms)
43 import Gargantext.Viz.Graph (Graph(..), data2graph)
44 import Gargantext.Viz.Graph.Bridgeness (bridgeness)
45 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
46 --import Gargantext.Viz.Graph.Distances.Matrice (distributional)
47 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
48 {-
49 ____ _ _
50 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
51 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
52 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
53 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
54 |___/
55 -}
56
57
58 contextText :: [T.Text]
59 contextText = ["The dog is an animal."
60 ,"The bird is an animal."
61 ,"The dog is an animal."
62 ,"The animal is a bird or a dog ?"
63 ,"The table is an object."
64 ,"The pen is an object."
65 ,"The object is a pen or a table ?"
66 ,"The girl is a human."
67 ,"The boy is a human."
68 ,"The boy or the girl are human."
69 ]
70
71
72 -- | Control the flow of text
73 data TextFlow = CSV FilePath
74 | FullText FilePath
75 | Contexts [T.Text]
76 | DBV3 Connection CorpusId
77 | Query T.Text
78
79
80 textFlow :: TermType Lang -> TextFlow -> IO Graph
81 textFlow termType workType = do
82 contexts <- case workType of
83 FullText path -> splitBy (Sentences 5) <$> readFile path
84 CSV path -> readCsvOn [csv_title, csv_abstract] path
85 Contexts ctxt -> pure ctxt
86 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
87 _ -> undefined -- TODO Query not supported
88
89 textFlow' termType contexts
90
91
92 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
93 textFlow' termType contexts = do
94 -- Context :: Text -> [Text]
95 -- Contexts = Paragraphs n | Sentences n | Chars n
96
97 myterms <- extractTerms termType contexts
98 -- TermsType = Mono | Multi | MonoMulti
99 -- myterms # filter (\t -> not . elem t stopList)
100 -- # groupBy (Stem|GroupList|Ontology)
101 --printDebug "terms" myterms
102 --printDebug "myterms" (sum $ map length myterms)
103
104 -- Bulding the map list
105 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
106 -- Cooc = Map (Term, Term) Int
107 let myCooc1 = cooc myterms
108 --printDebug "myCooc1 size" (M.size myCooc1)
109
110 -- Remove Apax: appears one time only => lighting the matrix
111 let myCooc2 = M.filter (>0) myCooc1
112 --printDebug "myCooc2 size" (M.size myCooc2)
113 --printDebug "myCooc2" myCooc2
114 g <- cooc2graph myCooc2
115 pure g
116
117 -- TODO use Text only here instead of [Text]
118 cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
119 cooc2graph myCooc = do
120 --printDebug "myCooc" myCooc
121 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
122 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
123 (InclusionSize 500 )
124 (SampleBins 10 )
125 (Clusters 3 )
126 (DefaultValue 0 )
127 ) myCooc
128 --printDebug "myCooc3 size" $ M.size myCooc3
129 --printDebug "myCooc3" myCooc3
130
131 -- Cooc -> Matrix
132 let (ti, _) = createIndices myCooc3
133 --printDebug "ti size" $ M.size ti
134 --printDebug "ti" ti
135
136 let myCooc4 = toIndex ti myCooc3
137 --printDebug "myCooc4 size" $ M.size myCooc4
138 --printDebug "myCooc4" myCooc4
139
140 let matCooc = map2mat (0) (M.size ti) myCooc4
141 --printDebug "matCooc shape" $ A.arrayShape matCooc
142 --printDebug "matCooc" matCooc
143
144 -- Matrix -> Clustering
145 let distanceMat = measureConditional matCooc
146 --let distanceMat = distributional matCooc
147 --printDebug "distanceMat shape" $ A.arrayShape distanceMat
148 --printDebug "distanceMat" distanceMat
149
150 --let distanceMap = M.filter (>0) $ mat2map distanceMat
151 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
152 --printDebug "distanceMap size" $ M.size distanceMap
153 --printDebug "distanceMap" distanceMap
154
155 -- let distance = fromIndex fi distanceMap
156 --printDebug "distance" $ M.size distance
157
158 partitions <- case M.size distanceMap > 0 of
159 True -> cLouvain distanceMap
160 False -> panic "Text.Flow: DistanceMap is empty"
161 -- Building : -> Graph -> JSON
162 --printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
163 --printDebug "partitions" partitions
164 let distanceMap' = bridgeness 300 partitions distanceMap
165 pure $ data2graph (M.toList ti) myCooc4 distanceMap' partitions
166