2 Module : Gargantext.TextFlow
3 Description : Server API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 From text to visualization: the whole text-processing flow in Gargantext.
14 {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
17 module Gargantext.TextFlow
20 import GHC.IO (FilePath)
21 import qualified Data.Text as T
22 import Data.Text.IO (readFile)
25 import qualified Data.Array.Accelerate as A
26 import qualified Data.Map.Strict as M
27 ----------------------------------------------
28 import Gargantext.Core (Lang)
29 import Gargantext.Prelude
31 import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
32 import Gargantext.Viz.Graph.Distances.Matrice (distributional)
33 import Gargantext.Viz.Graph (Graph(..), data2graph)
34 import Gargantext.Text.Metrics.Count (cooc)
35 import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
36 import Gargantext.Text.Terms (TermType, extractTerms)
37 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
39 import Gargantext.Text.Parsers.CSV
41 import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
45 / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
46 | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
47 | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
48 \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
-- | Source of the text that 'textFlow' turns into a 'Graph'.
-- The @CSV@ constructor carries the path of a CSV file.
53 data TextFlow = CSV FilePath
-- | Build a 'Graph' from a text source.
--
-- Resolves the given 'TextFlow' to a list of text contexts, then passes them,
-- together with the term type, to @textFlow'@:
--
--   * @FullText path@: the file is read as 'T.Text' and split with
--     @splitBy (Sentences 5)@;
--   * @CSV path@: contexts come from @readCsvOn [csv_title, csv_abstract] path@;
--   * @Contexts ctxt@: the contexts are used as given.
61 textFlow :: TermType Lang -> TextFlow -> IO Graph
62 textFlow termType workType = do
-- Resolve the input source to the list of contexts to analyse.
63 contexts <- case workType of
-- NOTE(review): presumably splits into groups of 5 sentences -- confirm in Gargantext.Text.Context.
64 FullText path -> splitBy (Sentences 5) <$> readFile path
-- Uses the title and abstract fields of the CSV (csv_title, csv_abstract).
65 CSV path -> readCsvOn [csv_title, csv_abstract] path
66 Contexts ctxt -> pure ctxt
-- The actual pipeline (terms -> cooccurrences -> distances -> clusters -> graph) lives in textFlow'.
69 textFlow' termType contexts
-- | Build a 'Graph' from a list of text contexts.
--
-- Steps, in order: extract terms ('extractTerms'), count cooccurrences
-- ('cooc'), keep only entries whose count is above 1, filter with
-- 'filterCooc', index the terms ('createIndices' / 'toIndex'), convert to a
-- matrix ('map2mat'), compute the 'distributional' distance, convert back to
-- a map ('mat2map'), cluster with 'cLouvain', and assemble the result with
-- 'data2graph'. Intermediate sizes and values are reported via 'printDebug'.
72 textFlow' :: TermType Lang -> [T.Text] -> IO Graph
73 textFlow' termType contexts = do
74 -- Context :: Text -> [Text]
75 -- Contexts = Paragraphs n | Sentences n | Chars n
-- Extract the terms of every context, according to the requested term type.
77 myterms <- extractTerms termType contexts
78 -- TermsType = Mono | Multi | MonoMulti
79 -- myterms # filter (\t -> not . elem t stopList)
80 -- # groupBy (Stem|GroupList|Ontology)
-- Debug: total number of extracted terms, summed over all contexts.
81 printDebug "myterms" (sum $ map length myterms)
83 -- Building the map list
84 -- compute copresences of terms, i.e. cooccurrences of terms in the same context of text
85 -- Cooc = Map (Term, Term) Int
86 let myCooc1 = cooc myterms
87 printDebug "myCooc1" (M.size myCooc1)
89 -- Remove hapax: entries counted only once are dropped (keeps counts > 1) => lightens the matrix
90 let myCooc2 = M.filter (>1) myCooc1
91 printDebug "myCooc2" (M.size myCooc2)
93 -- Filtering terms with Inclusion/Exclusion and Specificity/Genericity scores
94 let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
100 printDebug "myCooc3" $ M.size myCooc3
101 -- putStrLn $ show myCooc3
-- Index the terms: only the first component returned by createIndices (ti) is used here.
104 let (ti, _) = createIndices myCooc3
105 printDebug "ti" $ M.size ti
-- Re-express the cooccurrences through the index ti.
107 let myCooc4 = toIndex ti myCooc3
108 printDebug "myCooc4" $ M.size myCooc4
-- NOTE(review): presumably 0 is the default cell value and (M.size ti) the matrix dimension -- confirm in Gargantext.Viz.Graph.Index.
110 let matCooc = map2mat (0) (M.size ti) myCooc4
-- NOTE(review): this prints the whole matrix, not just its size; may be very verbose on large inputs.
111 printDebug "matCooc" matCooc
113 -- Matrix -> Clustering
114 --let distanceMat = conditional' matCooc
115 let distanceMat = distributional matCooc
-- The same debug label is used twice below: first for the shape, then for the full matrix.
116 printDebug "distanceMat" $ A.arrayShape distanceMat
117 printDebug "distanceMat" distanceMat
-- Back from matrix to map form, which is what cLouvain and data2graph are given below.
119 let distanceMap = mat2map distanceMat
120 printDebug "distanceMap" $ M.size distanceMap
122 -- let distance = fromIndex fi distanceMap
123 -- printDebug "distance" $ M.size distance
-- Cluster with cLouvain (runs in IO).
125 partitions <- cLouvain distanceMap
126 -- Building : -> Graph -> JSON
127 printDebug "partitions" $ length partitions
128 --printDebug "partitions" partitions
-- Assemble the graph from the term index, the indexed cooccurrences, the distances and the partitions.
129 pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions