]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/TextFlow.hs
[VERSION + DOC] starting the versionning from 0. Adding API modules to the doc.
[gargantext.git] / src / Gargantext / TextFlow.hs
1 {-|
2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 From text to viz, all the flow of texts in Gargantext.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16
17 module Gargantext.TextFlow
18 where
19
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
23
24 import Control.Monad.IO.Class (MonadIO)
25
26 import Data.Map.Strict (Map)
27 import qualified Data.Array.Accelerate as A
28 import qualified Data.Map.Strict as M
29 ----------------------------------------------
30 import Gargantext.Core (Lang)
31 import Gargantext.Core.Types (Label)
32 import Gargantext.Prelude
33
34 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
35 import Gargantext.Viz.Graph.Distances.Matrice (conditional)
36 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
37 import Gargantext.Text.Metrics.Count (cooc)
38 import Gargantext.Text.Metrics
39 import Gargantext.Text.Terms (TermType, extractTerms)
40 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
41
42 import Gargantext.Text.Parsers.CSV
43
44 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
45
46
47 {-
48 ____ _ _
49 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
50 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
51 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
52 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
53 |___/
54 -}
55
56 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
57 printDebug msg x = putStrLn $ msg <> " " <> show x
58 -- printDebug _ _ = pure ()
59
60 data TextFlow = CSV FilePath
61 | FullText FilePath
62 | Contexts [T.Text]
63 | SQL Int
64 | Database T.Text
65 -- ExtDatabase Query
66 -- IntDatabase NodeId
67
68 textFlow :: TermType Lang -> TextFlow -> IO Graph
69 textFlow termType workType = do
70 contexts <- case workType of
71 FullText path -> splitBy (Sentences 5) <$> readFile path
72 CSV path -> readCsvOn [csv_title, csv_abstract] path
73 Contexts ctxt -> pure ctxt
74 _ -> undefined
75
76 textFlow' termType contexts
77
78
79 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
80 textFlow' termType contexts = do
81 -- Context :: Text -> [Text]
82 -- Contexts = Paragraphs n | Sentences n | Chars n
83
84 myterms <- extractTerms termType contexts
85 -- TermsType = Mono | Multi | MonoMulti
86 -- myterms # filter (\t -> not . elem t stopList)
87 -- # groupBy (Stem|GroupList|Ontology)
88 printDebug "myterms" (sum $ map length myterms)
89
90 -- Bulding the map list
91 -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
92 -- Cooc = Map (Term, Term) Int
93 let myCooc1 = cooc myterms
94 printDebug "myCooc1" (M.size myCooc1)
95
96 -- Remove Apax: appears one time only => lighting the matrix
97 let myCooc2 = M.filter (>1) myCooc1
98 printDebug "myCooc2" (M.size myCooc2)
99
100 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
101 let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
102 (InclusionSize 400 )
103 (SampleBins 10 )
104 (Clusters 3 )
105 (DefaultValue 0 )
106 ) myCooc2
107 printDebug "myCooc3" $ M.size myCooc3
108 putStrLn $ show myCooc3
109
110 -- Cooc -> Matrix
111 let (ti, _) = createIndices myCooc3
112 printDebug "ti" $ M.size ti
113
114 let myCooc4 = toIndex ti myCooc3
115 printDebug "myCooc4" $ M.size myCooc4
116
117 let matCooc = map2mat (0) (M.size ti) myCooc4
118 --printDebug "matCooc" matCooc
119 -- Matrix -> Clustering
120 let distanceMat = conditional matCooc
121 -- let distanceMat = distributional matCooc
122 printDebug "distanceMat" $ A.arrayShape distanceMat
123 --printDebug "distanceMat" distanceMat
124 --
125 let distanceMap = mat2map distanceMat
126 printDebug "distanceMap" $ M.size distanceMap
127 --{-
128 -- let distance = fromIndex fi distanceMap
129 -- printDebug "distance" $ M.size distance
130 ---}
131 partitions <- cLouvain distanceMap
132 -- Building : -> Graph -> JSON
133 printDebug "partitions" $ length partitions
134 --printDebug "partitions" partitions
135 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
136
137 -----------------------------------------------------------
138 -- distance should not be a map since we just "toList" it (same as cLouvain)
139 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
140 -> Map (Int, Int) Double
141 -> [LouvainNode]
142 -> Graph
143 data2graph labels coocs distance partitions = Graph nodes edges
144 where
145 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
146 nodes = [ Node { node_size = maybe 0 identity (M.lookup (n,n) coocs)
147 , node_type = Terms -- or Unknown
148 , node_id = cs (show n)
149 , node_label = T.unwords l
150 , node_attributes =
151 Attributes { clust_default = maybe 0 identity
152 (M.lookup n community_id_by_node_id) } }
153 | (l, n) <- labels ]
154 edges = [ Edge { edge_source = cs (show s)
155 , edge_target = cs (show t)
156 , edge_weight = w
157 , edge_id = cs (show i) }
158 | (i, ((s,t), w)) <- zip ([0..]::[Integer]) (M.toList distance) ]
159 -----------------------------------------------------------
160