3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Main specifications to index a corpus with a term list
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import Data.ByteString.Lazy (writeFile)
27 import Data.Maybe (catMaybes)
28 import Data.Text (pack)
29 import qualified Data.Text as DT
31 import Data.Tuple.Extra (both)
32 import qualified Data.Vector as DV
33 import qualified Data.Maybe as DMaybe
35 import Control.Monad (zipWithM)
36 import Control.Monad.IO.Class
39 import qualified Data.IntMap as DIM
40 import qualified Data.Map as DM
45 import Data.Text (Text)
46 import Data.List (cycle, concat, unwords)
47 import Data.List.Split (chunksOf)
48 import System.IO (hPutStr, hFlush, stderr)
49 import System.Environment
50 import Control.Concurrent.Async as CCA (mapConcurrently)
51 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
54 import Gargantext.Prelude
55 import Gargantext.Core
56 import Gargantext.Core.Types
57 import Gargantext.Text.Terms
58 import Gargantext.Text.Context
59 import Gargantext.Text.Terms.WithList
60 import Gargantext.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
61 import Gargantext.Text.List.CSV (csvGraphTermList)
62 import Gargantext.Text.Terms (terms)
63 import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
65 ------------------------------------------------------------------------
68 data CoocByYear = CoocByYear { year :: Int
69 , nbContexts :: NbContexts
70 , coocurrences :: Map (Text, Text) Coocs
71 } deriving (Show, Generic)
73 data CoocByYears = CoocByYears { years :: [CoocByYear] }
74 deriving (Show, Generic)
78 instance ToJSON CoocByYear
79 instance ToJSON CoocByYears
80 ------------------------------------------------------------------------
85 -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
86 filterTermsAndCooc patterns (year, ts) = do
88 r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
90 pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
95 (p, _) <- threadCapability tid
97 ["filterTermsAndCooc:", m, show year, "on proc", show p]
101 [corpusFile, termListFile, outputFile] <- getArgs
103 --corpus :: IO (DM.IntMap [[Text]])
104 corpus <- DM.fromListWith (<>)
106 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
108 <$> readFile corpusFile
110 -- termListMap :: [Text]
111 termList <- csvGraphTermList termListFile
113 putStrLn $ show $ length termList
115 let patterns = buildPatterns termList
117 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
118 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
119 writeFile outputFile $ encode (CoocByYears r)
123 ------------------------------------------------------------------------
125 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
127 bs <- zipWithM g (cycle "-\\|/") xs
128 liftIO $ hPutStr stderr "\rDone\n"
132 liftIO $ hPutStr stderr ['\r',c]
133 liftIO $ hFlush stderr
136 -- | Optimi that need further developments (not used yet)
137 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
138 mapConcurrentlyChunked f ts = do
139 caps <- getNumCapabilities
140 let n = 1 `max` (length ts `div` caps)
141 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
144 --terms' :: Patterns -> Text -> Corpus [[Text]]
145 terms' pats txt = pure $ concat $ extractTermsWithList pats txt
148 -- | TODO Minimal Example
150 -- let patterns = buildPatterns testTermList
151 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
152 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
155 testCorpus :: [(Int, [Text])]
156 testCorpus = [ (1998, [pack "The beees"])
157 , (1999, [ pack "The bees and the flowers"
158 --, pack "The bees and the flowers"
162 testTermList :: TermList
163 testTermList = [ ([pack "bee"], [[pack "bees"]])
164 , ([pack "flower"], [[pack "flowers"]])