{-|
Module      : Gargantext.TextFlow
Description : Server API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

From text to viz, all the flow of texts in Gargantext.

-}

{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude           #-}

module Gargantext.TextFlow
  where

import GHC.IO (FilePath)
-- Imported qualified so that it cannot clash with anything exported by
-- Gargantext.Prelude.
import qualified Control.Exception as Exception (ErrorCall (..), throwIO)
import qualified Data.Text as T
import Data.Text.IO (readFile)

import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
----------------------------------------------
import Gargantext.Core (Lang)
import Gargantext.Prelude

import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (distributional)
import Gargantext.Viz.Graph (Graph(..), data2graph)
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
import Gargantext.Text.Terms (TermType, extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))

import Gargantext.Text.Parsers.CSV

import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)

{-
  ____                             _            _
 / ___| __ _ _ __ __ _  __ _ _ __ | |_ _____  _| |_
| |  _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | ||  __/>  <| |_
 \____|\__,_|_|  \__, |\__,_|_| |_|\__\___/_/\_\\__|
                 |___/
-}

-- | Where the texts fed to 'textFlow' come from.
data TextFlow = CSV      FilePath  -- ^ CSV file; its title and abstract columns are read
              | FullText FilePath  -- ^ text file, split into contexts of 5 sentences
              | Contexts [T.Text]  -- ^ contexts already split, used as they are
              | SQL      Int       -- ^ not handled by 'textFlow' yet
              | Database T.Text    -- ^ not handled by 'textFlow' yet
                -- ExtDatabase Query
                -- IntDatabase NodeId

-- | Load the contexts described by the 'TextFlow' source, then run the whole
-- pipeline ('textFlow'') on them.
--
-- The 'SQL' and 'Database' sources are not implemented: for those the action
-- throws an 'Exception.ErrorCall' naming the unsupported source.  Every
-- constructor is matched explicitly (no wildcard) so that a constructor added
-- to 'TextFlow' later is reported by the exhaustiveness checker instead of
-- silently falling into a crash.
textFlow :: TermType Lang -> TextFlow -> IO Graph
textFlow termType workType = do
  contexts <- case workType of
                FullText path -> splitBy (Sentences 5) <$> readFile path
                CSV      path -> readCsvOn [csv_title, csv_abstract] path
                Contexts ctxt -> pure ctxt
                SQL      _    -> Exception.throwIO
                                   (Exception.ErrorCall "textFlow: the SQL source is not supported yet")
                Database _    -> Exception.throwIO
                                   (Exception.ErrorCall "textFlow: the Database source is not supported yet")

  textFlow' termType contexts

-- | From a list of contexts to a 'Graph': extract the terms, count their
-- cooccurrences, filter them, index them into a matrix, compute a distance
-- matrix, cluster it and assemble the graph.
--
-- Every intermediate step is traced with 'printDebug'.
textFlow' :: TermType Lang -> [T.Text] -> IO Graph
textFlow' termType contexts = do
  -- Context :: Text -> [Text]
  -- Contexts = Paragraphs n | Sentences n | Chars n

  myterms <- extractTerms termType contexts
  -- TermsType = Mono | Multi | MonoMulti
  -- myterms # filter (\t -> not . elem t stopList)
  --         # groupBy (Stem|GroupList|Ontology)
  printDebug "myterms" (sum $ map length myterms)

  -- Building the map list
  -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
  -- Cooc = Map (Term, Term) Int
  let myCooc1 = cooc myterms
  printDebug "myCooc1" (M.size myCooc1)

  -- Remove hapax: entries counted only once are dropped => lightening the matrix
  let myCooc2 = M.filter (>1) myCooc1
  printDebug "myCooc2" (M.size myCooc2)

  -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
  let myCooc3 = filterCooc ( FilterConfig (MapListSize   100)
                                          (InclusionSize 900)
                                          (SampleBins     10)
                                          (Clusters        3)
                                          (DefaultValue    0)
                           ) myCooc2
  printDebug "myCooc3" $ M.size myCooc3
  -- putStrLn $ show myCooc3

  -- Cooc -> Matrix
  -- Only the first component of the indices is needed here.
  let (ti, _) = createIndices myCooc3
  printDebug "ti" $ M.size ti

  let myCooc4 = toIndex ti myCooc3
  printDebug "myCooc4" $ M.size myCooc4

  -- Square matrix sized by the number of indexed terms, 0 as default cell value.
  let matCooc = map2mat 0 (M.size ti) myCooc4
  printDebug "matCooc" matCooc

  -- Matrix -> Clustering
  --let distanceMat = conditional' matCooc
  let distanceMat = distributional matCooc
  printDebug "distanceMat" $ A.arrayShape distanceMat
  printDebug "distanceMat" distanceMat
  --
  let distanceMap = mat2map distanceMat
  printDebug "distanceMap" $ M.size distanceMap
  -- let distance = fromIndex fi distanceMap
  -- printDebug "distance" $ M.size distance

  partitions <- cLouvain distanceMap

  -- Building : -> Graph -> JSON
  printDebug "partitions" $ length partitions
  --printDebug "partitions" partitions
  pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions