]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[DBFLOW] getViewDocument + uniqIdBdd (enables duplicatas from different Database...
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24 import Data.Maybe (catMaybes)
25 import qualified Data.Set as DS
26
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Database (Connection)
31
32 import Gargantext.Database.Node
33 import Gargantext.Database.Types.Node
34
35 import Gargantext.Core (Lang)
36 import Gargantext.Prelude
37
38 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
39 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
40 import Gargantext.Viz.Graph (Graph(..), data2graph)
41 import Gargantext.Text.Metrics.Count (cooc)
42 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
43 import Gargantext.Text.Terms (TermType, extractTerms)
44 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
45
46 import Gargantext.Text.Parsers.CSV
47
48 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
49
50 {-
51 ____ _ _
52 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
53 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
54 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
55 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
56 |___/
57 -}
58
59
60 contextText :: [T.Text]
61 contextText = map T.pack ["The dog is an animal."
62 ,"The bird is a animal."
63 ,"The dog is a animal."
64 ,"The animal is a bird or a dog ?"
65 ,"The table is a object."
66 ,"The pen is a object."
67 ,"The object is a pen or a table ?"
68 ,"The girl is human body."
69 ,"The boy is human body."
70 ,"The boy or the girl are human body."
71 ]
72
73
74 -- | Control the flow of text
75 data TextFlow = CSV FilePath
76 | FullText FilePath
77 | Contexts [T.Text]
78 | DB Connection CorpusId
79 | Query T.Text
80
81
82 textFlow :: TermType Lang -> TextFlow -> IO Graph
83 textFlow termType workType = do
84 contexts <- case workType of
85 FullText path -> splitBy (Sentences 5) <$> readFile path
86 CSV path -> readCsvOn [csv_title, csv_abstract] path
87 Contexts ctxt -> pure ctxt
88 DB con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
89 _ -> undefined -- TODO Query not supported
90
91 textFlow' termType contexts
92
93
94 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
95 textFlow' termType contexts = do
96 -- Context :: Text -> [Text]
97 -- Contexts = Paragraphs n | Sentences n | Chars n
98
99 myterms <- extractTerms termType contexts
100 -- TermsType = Mono | Multi | MonoMulti
101 -- myterms # filter (\t -> not . elem t stopList)
102 -- # groupBy (Stem|GroupList|Ontology)
103 printDebug "terms" myterms
104 printDebug "myterms" (sum $ map length myterms)
105
106 -- Bulding the map list
107 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
108 -- Cooc = Map (Term, Term) Int
109 let myCooc1 = cooc myterms
110 printDebug "myCooc1 size" (M.size myCooc1)
111
112 -- Remove Apax: appears one time only => lighting the matrix
113 let myCooc2 = M.filter (>0) myCooc1
114 printDebug "myCooc2 size" (M.size myCooc2)
115 printDebug "myCooc2" myCooc2
116
117
118 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
119 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
120 (InclusionSize 500 )
121 (SampleBins 10 )
122 (Clusters 3 )
123 (DefaultValue 0 )
124 ) myCooc2
125 printDebug "myCooc3 size" $ M.size myCooc3
126 printDebug "myCooc3" myCooc3
127
128 -- Cooc -> Matrix
129 let (ti, _) = createIndices myCooc3
130 printDebug "ti size" $ M.size ti
131 printDebug "ti" ti
132
133 let myCooc4 = toIndex ti myCooc3
134 printDebug "myCooc4 size" $ M.size myCooc4
135 printDebug "myCooc4" myCooc4
136
137 let matCooc = map2mat (0) (M.size ti) myCooc4
138 printDebug "matCooc shape" $ A.arrayShape matCooc
139 printDebug "matCooc" matCooc
140
141 -- Matrix -> Clustering
142 let distanceMat = measureConditional matCooc
143 --let distanceMat = distributional matCooc
144 printDebug "distanceMat shape" $ A.arrayShape distanceMat
145 printDebug "distanceMat" distanceMat
146 --
147 --let distanceMap = M.filter (>0) $ mat2map distanceMat
148 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
149 printDebug "distanceMap size" $ M.size distanceMap
150 printDebug "distanceMap" distanceMap
151
152 -- let distance = fromIndex fi distanceMap
153 -- printDebug "distance" $ M.size distance
154
155 partitions <- cLouvain distanceMap
156 -- Building : -> Graph -> JSON
157 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
158 --printDebug "partitions" partitions
159 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
160