]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Pipeline.hs
[TEXTLINE] adding CSV format parser.
[gargantext.git] / src / Gargantext / Pipeline.hs
1 {-|
2 Module : Gargantext.Pipeline
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13
14 module Gargantext.Pipeline
15 where
16
17 import qualified Data.Text as T
18 import Data.Text.IO (readFile)
19
20 import Control.Arrow ((***))
21 import Data.Map.Strict (Map)
22 import qualified Data.Array.Accelerate as A
23 import qualified Data.Map.Strict as M
24 import qualified Data.List as L
25 import Data.Tuple.Extra (both)
26 ----------------------------------------------
27 import Gargantext.Core (Lang(FR))
28 import Gargantext.Core.Types (Label)
29 import Gargantext.Prelude
30 import Prelude (print, seq)
31
32 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
33 import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
34 import Gargantext.Viz.Graph.Index (Index)
35 import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
36 import Gargantext.Text.Metrics.Count (cooc)
37 import Gargantext.Text.Metrics
38 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
39 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
40
41 import Gargantext.Text.Parsers.CSV
42
43 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
44
45
46 {-
47 ____ _ _
48 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
49 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
50 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
51 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
52 |___/
53
54 -}
55
56 data WorkType = CSV | FullText
57
58 -- workflow :: Lang (EN|FR) -> FilePath -> Graph
59 workflow termsLang workType path = do
60 -- Text <- IO Text <- FilePath
61 contexts <- case workType of
62 FullText -> splitBy (Sentences 5) <$> readFile path
63 CSV -> readCsvOn [csv_title, csv_abstract] path
64
65 -- Context :: Text -> [Text]
66 -- Contexts = Paragraphs n | Sentences n | Chars n
67
68 myterms <- extractTerms (Mono FR) contexts
69 -- TermsType = Mono | Multi | MonoMulti
70 -- myterms # filter (\t -> not . elem t stopList)
71 -- # groupBy (Stem|GroupList|Ontology)
72 printDebug "myterms" (sum $ map length myterms)
73
74 -- Bulding the map list
75 -- compute copresences of terms
76 -- Cooc = Map (Term, Term) Int
77 let myCooc1 = cooc myterms
78 printDebug "myCooc1" (M.size myCooc1)
79
80 -- Remove Apax: appears one time only => lighting the matrix
81 let myCooc2 = M.filter (>1) myCooc1
82 printDebug "myCooc2" (M.size myCooc2)
83
84 -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
85 let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
86 (InclusionSize 4000 )
87 (SampleBins 10 )
88 (Clusters 3 )
89 (DefaultValue 0 )
90 ) myCooc2
91 printDebug "myCooc3" $ M.size myCooc3
92
93 -- Cooc -> Matrix
94 let (ti, fi) = createIndices myCooc3
95 printDebug "ti" $ M.size ti
96
97 let myCooc4 = toIndex ti myCooc3
98 printDebug "myCooc4" $ M.size myCooc4
99
100 let matCooc = map2mat (0) (M.size ti) myCooc4
101 --printDebug "matCooc" matCooc
102 -- Matrix -> Clustering
103 let distanceMat = conditional matCooc
104 -- let distanceMat = distributional matCooc
105 printDebug "distanceMat" $ A.arrayShape distanceMat
106 --printDebug "distanceMat" distanceMat
107 --
108 let distanceMap = mat2map distanceMat
109 printDebug "distanceMap" $ M.size distanceMap
110 --{-
111 -- let distance = fromIndex fi distanceMap
112 -- printDebug "distance" $ M.size distance
113 ---}
114 partitions <- cLouvain distanceMap
115 ------ | Building : -> Graph -> JSON
116 printDebug "partitions" $ length partitions
117 --printDebug "partitions" partitions
118 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
119
120
121 -----------------------------------------------------------
122 -- distance should not be a map since we just "toList" it (same as cLouvain)
123 data2graph :: [(Label, Int)] -> Map (Int, Int) Int
124 -> Map (Int, Int) Double
125 -> [LouvainNode]
126 -> Graph
127 data2graph labels coocs distance partitions = Graph nodes edges
128 where
129 community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
130 nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
131 , n_type = Terms -- or Unknown
132 , n_id = cs (show n)
133 , n_label = T.unwords l
134 , n_attributes =
135 Attributes { clust_default = maybe 0 identity
136 (M.lookup n community_id_by_node_id) } }
137 | (l, n) <- labels ]
138 edges = [ Edge { e_source = s
139 , e_target = t
140 , e_weight = w
141 , e_id = i }
142 | (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
143 -----------------------------------------------------------
144
145 printDebug msg x = putStrLn $ msg <> " " <> show x
146 --printDebug _ _ = pure ()
147
148
149
150