]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Flow.hs
[TEXT][PARSERS] Isidore.
[gargantext.git] / src / Gargantext / Text / Flow.hs
1 {-|
2 Module : Gargantext.Text.Flow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17
18 module Gargantext.Text.Flow
19 where
20
21 import Control.Monad.Reader
22 import GHC.IO (FilePath)
23 import qualified Data.Text as T
24 import Data.Text.IO (readFile)
25
26 import Data.Maybe (catMaybes)
27 import qualified Data.Set as DS
28
29 import qualified Data.Array.Accelerate as A
30 import qualified Data.Map.Strict as M
31 ----------------------------------------------
32 import Database.PostgreSQL.Simple (Connection)
33
34 import Gargantext.Database.Schema.Node
35 import Gargantext.Database.Types.Node
36
37 import Gargantext.Core (Lang)
38 import Gargantext.Prelude
39
40 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
41 import Gargantext.Viz.Graph.Distances.Matrice (measureConditional)
42 import Gargantext.Viz.Graph (Graph(..), data2graph)
43 import Gargantext.Text.Metrics.Count (cooc)
44 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
45 import Gargantext.Text.Terms (TermType, extractTerms)
46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
47 import Gargantext.Core.Types (CorpusId)
48
49 import Gargantext.Text.Parsers.CSV
50
51 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
52
53 {-
54 ____ _ _
55 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
56 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
57 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
58 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
59 |___/
60 -}
61
62
63 contextText :: [T.Text]
64 contextText = ["The dog is an animal."
65 ,"The bird is an animal."
66 ,"The dog is an animal."
67 ,"The animal is a bird or a dog ?"
68 ,"The table is an object."
69 ,"The pen is an object."
70 ,"The object is a pen or a table ?"
71 ,"The girl is a human."
72 ,"The boy is a human."
73 ,"The boy or the girl are human."
74 ]
75
76
77 -- | Control the flow of text
78 data TextFlow = CSV FilePath
79 | FullText FilePath
80 | Contexts [T.Text]
81 | DBV3 Connection CorpusId
82 | Query T.Text
83
84
85 textFlow :: TermType Lang -> TextFlow -> IO Graph
86 textFlow termType workType = do
87 contexts <- case workType of
88 FullText path -> splitBy (Sentences 5) <$> readFile path
89 CSV path -> readCsvOn [csv_title, csv_abstract] path
90 Contexts ctxt -> pure ctxt
91 DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con
92 _ -> undefined -- TODO Query not supported
93
94 textFlow' termType contexts
95
96
97 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
98 textFlow' termType contexts = do
99 -- Context :: Text -> [Text]
100 -- Contexts = Paragraphs n | Sentences n | Chars n
101
102 myterms <- extractTerms termType contexts
103 -- TermsType = Mono | Multi | MonoMulti
104 -- myterms # filter (\t -> not . elem t stopList)
105 -- # groupBy (Stem|GroupList|Ontology)
106 printDebug "terms" myterms
107 printDebug "myterms" (sum $ map length myterms)
108
109 -- Bulding the map list
110 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
111 -- Cooc = Map (Term, Term) Int
112 let myCooc1 = cooc myterms
113 printDebug "myCooc1 size" (M.size myCooc1)
114
115 -- Remove Apax: appears one time only => lighting the matrix
116 let myCooc2 = M.filter (>0) myCooc1
117 printDebug "myCooc2 size" (M.size myCooc2)
118 printDebug "myCooc2" myCooc2
119
120
121 -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
122 let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
123 (InclusionSize 500 )
124 (SampleBins 10 )
125 (Clusters 3 )
126 (DefaultValue 0 )
127 ) myCooc2
128 printDebug "myCooc3 size" $ M.size myCooc3
129 printDebug "myCooc3" myCooc3
130
131 -- Cooc -> Matrix
132 let (ti, _) = createIndices myCooc3
133 printDebug "ti size" $ M.size ti
134 printDebug "ti" ti
135
136 let myCooc4 = toIndex ti myCooc3
137 printDebug "myCooc4 size" $ M.size myCooc4
138 printDebug "myCooc4" myCooc4
139
140 let matCooc = map2mat (0) (M.size ti) myCooc4
141 printDebug "matCooc shape" $ A.arrayShape matCooc
142 printDebug "matCooc" matCooc
143
144 -- Matrix -> Clustering
145 let distanceMat = measureConditional matCooc
146 --let distanceMat = distributional matCooc
147 printDebug "distanceMat shape" $ A.arrayShape distanceMat
148 printDebug "distanceMat" distanceMat
149 --
150 --let distanceMap = M.filter (>0) $ mat2map distanceMat
151 let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
152 printDebug "distanceMap size" $ M.size distanceMap
153 printDebug "distanceMap" distanceMap
154
155 -- let distance = fromIndex fi distanceMap
156 -- printDebug "distance" $ M.size distance
157
158 partitions <- cLouvain distanceMap
159 -- Building : -> Graph -> JSON
160 printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
161 --printDebug "partitions" partitions
162 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
163