{-| Module : Gargantext.Text.Flow Description : Server API Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX From text to viz, all the flow of texts in Gargantext. -} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} module Gargantext.Text.Flow where --import qualified Data.Array.Accelerate as A --import qualified Data.Set as DS import Control.Monad.Reader import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) import Data.Map.Strict (Map) import Data.Maybe (catMaybes) import qualified Data.Map.Strict as M import qualified Data.Text as T import Data.Text (Text) import Data.Text.IO (readFile) import Database.PostgreSQL.Simple (Connection) import GHC.IO (FilePath) import Gargantext.Core (Lang) import Gargantext.Core.Types (CorpusId) import Gargantext.Database.Schema.Node import Gargantext.Database.Types.Node import Gargantext.Prelude import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..)) import Gargantext.Text.Metrics.Count (cooc) import Gargantext.Text.Parsers.CSV import Gargantext.Text.Terms (TermType, extractTerms) import Gargantext.Viz.Graph (Graph(..), data2graph) import Gargantext.Viz.Graph.Bridgeness (bridgeness) import Gargantext.Viz.Graph.Distances.Matrice (measureConditional) --import Gargantext.Viz.Graph.Distances.Matrice (distributional) import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map) {- ____ _ _ / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_ | | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __| | |_| | (_| | | | (_| | (_| | | | | || __/> <| |_ \____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__| |___/ -} contextText :: [T.Text] contextText = ["The dog is an animal." ,"The bird is an animal." ,"The dog is an animal." ,"The animal is a bird or a dog ?" ,"The table is an object." ,"The pen is an object." ,"The object is a pen or a table ?" ,"The girl is a human." ,"The boy is a human." ,"The boy or the girl are human." ] -- | Control the flow of text data TextFlow = CSV FilePath | FullText FilePath | Contexts [T.Text] | DBV3 Connection CorpusId | Query T.Text textFlow :: TermType Lang -> TextFlow -> IO Graph textFlow termType workType = do contexts <- case workType of FullText path -> splitBy (Sentences 5) <$> readFile path CSV path -> readCsvOn [csv_title, csv_abstract] path Contexts ctxt -> pure ctxt DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> runReaderT (getDocumentsV3WithParentId corpusId) con _ -> undefined -- TODO Query not supported textFlow' termType contexts textFlow' :: TermType Lang -> [T.Text] -> IO Graph textFlow' termType contexts = do -- Context :: Text -> [Text] -- Contexts = Paragraphs n | Sentences n | Chars n myterms <- extractTerms termType contexts -- TermsType = Mono | Multi | MonoMulti -- myterms # filter (\t -> not . elem t stopList) -- # groupBy (Stem|GroupList|Ontology) --printDebug "terms" myterms --printDebug "myterms" (sum $ map length myterms) -- Bulding the map list -- compute copresences of terms, i.e. cooccurrences of terms in same context of text -- Cooc = Map (Term, Term) Int let myCooc1 = cooc myterms --printDebug "myCooc1 size" (M.size myCooc1) -- Remove Apax: appears one time only => lighting the matrix let myCooc2 = M.filter (>0) myCooc1 --printDebug "myCooc2 size" (M.size myCooc2) --printDebug "myCooc2" myCooc2 g <- cooc2graph myCooc2 pure g -- TODO use Text only here instead of [Text] cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph cooc2graph myCooc = do --printDebug "myCooc" myCooc -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 ) (InclusionSize 500 ) (SampleBins 10 ) (Clusters 3 ) (DefaultValue 0 ) ) myCooc --printDebug "myCooc3 size" $ M.size myCooc3 --printDebug "myCooc3" myCooc3 -- Cooc -> Matrix let (ti, _) = createIndices myCooc3 --printDebug "ti size" $ M.size ti --printDebug "ti" ti let myCooc4 = toIndex ti myCooc3 --printDebug "myCooc4 size" $ M.size myCooc4 --printDebug "myCooc4" myCooc4 let matCooc = map2mat (0) (M.size ti) myCooc4 --printDebug "matCooc shape" $ A.arrayShape matCooc --printDebug "matCooc" matCooc -- Matrix -> Clustering let distanceMat = measureConditional matCooc --let distanceMat = distributional matCooc --printDebug "distanceMat shape" $ A.arrayShape distanceMat --printDebug "distanceMat" distanceMat --let distanceMap = M.filter (>0) $ mat2map distanceMat let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat --printDebug "distanceMap size" $ M.size distanceMap --printDebug "distanceMap" distanceMap -- let distance = fromIndex fi distanceMap --printDebug "distance" $ M.size distance partitions <- case M.size distanceMap > 0 of True -> cLouvain distanceMap False -> panic "Text.Flow: DistanceMap is empty" -- Building : -> Graph -> JSON --printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions --printDebug "partitions" partitions let distanceMap' = bridgeness 300 partitions distanceMap pure $ data2graph (M.toList ti) myCooc4 distanceMap' partitions