]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[TEST] Adding Example Text for QTV (Qualitative Test Validation).
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24 import Control.Monad.IO.Class (MonadIO)
25
26 import Data.Map.Strict (Map)
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Core (Lang(FR))
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
33
34 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
35 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
36 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
37 import Gargantext.Text.Metrics.Count (cooc)
38 import Gargantext.Text.Metrics
39 import Gargantext.Text.Terms (TermType(Mono), extractTerms)
40 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41
42 import Gargantext.Text.Parsers.CSV
43
44 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
45
46
47 {-
48 ____ _ _
49 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
50 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
51 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
52 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
53 |___/
54 -}
55
56 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
57 printDebug msg x = putStrLn $ msg <> " " <> show x
58 --printDebug _ _ = pure ()
59
60 data TextFlow = CSV | FullText
61
62 -- workflow :: Lang (EN|FR) -> FilePath -> Graph
63 textflow :: Lang -> TextFlow -> FilePath -> IO Graph
64 textflow _ workType path = do
65 -- Text <- IO Text <- FilePath
66 contexts <- case workType of
67 FullText -> splitBy (Sentences 5) <$> readFile path
68 CSV -> readCsvOn [csv_title, csv_abstract] path
69
70 -- Context :: Text -> [Text]
71 -- Contexts = Paragraphs n | Sentences n | Chars n
72
73 myterms <- extractTerms (Mono FR) contexts
74 -- TermsType = Mono | Multi | MonoMulti
75 -- myterms # filter (\t -> not . elem t stopList)
76 -- # groupBy (Stem|GroupList|Ontology)
77 printDebug "myterms" (sum $ map length myterms)
78
79 -- Bulding the map list
80 -- compute copresences of terms
81 -- Cooc = Map (Term, Term) Int
82 let myCooc1 = cooc myterms
83 printDebug "myCooc1" (M.size myCooc1)
84
85 -- Remove Apax: appears one time only => lighting the matrix
86 let myCooc2 = M.filter (>1) myCooc1
87 printDebug "myCooc2" (M.size myCooc2)
88
89 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
90 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
91 (InclusionSize 4000 )
92 (SampleBins 10 )
93 (Clusters 3 )
94 (DefaultValue 0 )
95 ) myCooc2
96 printDebug "myCooc3" $ M.size myCooc3
97
98 -- Cooc -> Matrix
99 let (ti, _) = createIndices myCooc3
100 printDebug "ti" $ M.size ti
101
102 let myCooc4 = toIndex ti myCooc3
103 printDebug "myCooc4" $ M.size myCooc4
104
105 let matCooc = map2mat (0) (M.size ti) myCooc4
106 --printDebug "matCooc" matCooc
107 -- Matrix -> Clustering
108 let distanceMat = conditional matCooc
109 -- let distanceMat = distributional matCooc
110 printDebug "distanceMat" $ A.arrayShape distanceMat
111 --printDebug "distanceMat" distanceMat
112 --
113 let distanceMap = mat2map distanceMat
114 printDebug "distanceMap" $ M.size distanceMap
115 --{-
116 -- let distance = fromIndex fi distanceMap
117 -- printDebug "distance" $ M.size distance
118 ---}
119 partitions <- cLouvain distanceMap
120 ------ | Building : -> Graph -> JSON
121 printDebug "partitions" $ length partitions
122 --printDebug "partitions" partitions
123 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
124
125
126 -----------------------------------------------------------
127 -- distance should not be a map since we just "toList" it (same as cLouvain)
128 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
129 -> Map (Int, Int) Double
130 -> [LouvainNode]
131 -> Graph
132 data2graph labels coocs distance partitions = Graph nodes edges
133 where
134 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
135 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
136 , node_type = Terms -- or Unknown
137 , node_id = cs (show n)
138 , node_label = T.unwords l
139 , node_attributes =
140 Attributes { clust_default = maybe 0 identity
141 (M.lookup n community_id_by_node_id) } }
142 | (l, n) <- labels ]
143 edges = [ Edge { edge_source = cs (show s)
144 , edge_target = cs (show t)
145 , edge_weight = w
146 , edge_id = cs (show i) }
147 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
148 -----------------------------------------------------------
149