]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Pipeline.hs
[FIX] Corenlp lems -> using stem.
[gargantext.git] / src / Gargantext / Pipeline.hs
1 {-|
2 Module : Gargantext.Pipeline
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14
15 module Gargantext.Pipeline
16 where
17
18 import Data.Text.IO (readFile)
19
20 import Control.Arrow ((***))
21 import Data.Map.Strict (Map)
22 import qualified Data.Map.Strict as M
23 import qualified Data.Set as S
24 import qualified Data.List as L
25 import Data.Tuple.Extra (both)
26 ----------------------------------------------
27 import Gargantext.Core (Lang(FR))
28 import Gargantext.Prelude
29
30 import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
31 import Gargantext.Viz.Graph.Distances.Matrice (incExcSpeGen, conditional)
32 import Gargantext.Viz.Graph.Index (Index)
33 import Gargantext.Text.Metrics.Count (cooc, removeApax)
34 import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
35 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
36
37 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
38
39 --filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
40 --filterCooc m =
41 ---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
42 ----(ti, fi) = createIndices m
43 -- . fromIndex fi $ filterMat $ cooc2mat ti m
44
45
46
47 import Data.Array.Accelerate (Matrix)
48 filterMat :: Matrix Int -> [(Index, Index)]
49 filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen')
50 where
51 (incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (incExcSpeGen m)
52 n = nIe + nSg
53 nIe = 30
54 nSg = 70
55
56
57 pipeline path = do
58 -- Text <- IO Text <- FilePath
59 text <- readFile path
60 let contexts = splitBy (Sentences 3) text
61 myterms <- extractTerms Multi FR contexts
62
63 -- TODO filter (\t -> not . elem t stopList) myterms
64 -- TODO groupBy (Stem | GroupList)
65
66 let myCooc = removeApax $ cooc myterms
67 let (ti, fi) = createIndices myCooc
68 pure ti
69 -- Cooc -> Matrix
70
71 -- -- filter by spec/gen (dynmaic programming)
72 -- let theScores = M.filter (>0) $ score conditional myCoocFiltered
73 ----
74 ------ -- Matrix -> Clustering
75 ------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
76 -- partitions <- cLouvain theScores
77 -- pure partitions
78 ---- | Building : -> Graph -> JSON
79
80