3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Main specifications to index a corpus with a term list
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
20 import Control.Concurrent.Async as CCA (mapConcurrently)
21 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
22 import Control.Monad (zipWithM)
23 import Control.Monad.IO.Class
25 import Data.ByteString.Lazy (writeFile)
26 import Data.Either (Either(..))
27 import Data.List (cycle, concat, unwords)
28 import Data.List.Split (chunksOf)
30 import qualified Data.Map as DM
31 import Data.Text (pack, Text)
32 import qualified Data.Text as DT
33 import Data.Tuple.Extra (both)
34 import qualified Data.Vector as DV
36 import System.IO (hPutStr, hFlush, stderr)
37 import System.Environment
39 import Gargantext.Prelude
40 import Gargantext.Core
41 import Gargantext.Core.Types
42 import Gargantext.Core.Text.Terms
43 import Gargantext.Core.Text.Context
44 import Gargantext.Core.Text.Terms.WithList
45 import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
46 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
47 import Gargantext.Core.Text.Terms (terms)
48 import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
50 ------------------------------------------------------------------------
53 data CoocByYear = CoocByYear { year :: Int
54 , nbContexts :: NbContexts
55 , coocurrences :: Map (Text, Text) Coocs
56 } deriving (Show, Generic)
58 data CoocByYears = CoocByYears { years :: [CoocByYear] }
59 deriving (Show, Generic)
63 instance ToJSON CoocByYear
64 instance ToJSON CoocByYears
65 ------------------------------------------------------------------------
70 -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
71 filterTermsAndCooc patterns (year, ts) = do
73 r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
75 pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
80 (p, _) <- threadCapability tid
82 ["filterTermsAndCooc:", m, show year, "on proc", show p]
86 [corpusFile, termListFile, outputFile] <- getArgs
88 --corpus :: IO (DM.IntMap [[Text]])
89 eCorpusFile <- readCSVFile corpusFile
92 let corpus = DM.fromListWith (<>)
94 . DV.map (\n -> (fromMIntOrDec defaultYear $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
97 -- termListMap :: [Text]
98 termList <- csvMapTermList termListFile
100 putStrLn $ show $ length termList
102 let patterns = buildPatterns termList
104 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
105 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
106 writeFile outputFile $ encode (CoocByYears r)
107 Left e -> panic $ "Error: " <> (pack e)
111 ------------------------------------------------------------------------
113 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
115 bs <- zipWithM g (cycle "-\\|/") xs
116 liftIO $ hPutStr stderr "\rDone\n"
120 liftIO $ hPutStr stderr ['\r',c]
121 liftIO $ hFlush stderr
124 -- | Optimi that need further developments (not used yet)
125 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
126 mapConcurrentlyChunked f ts = do
127 caps <- getNumCapabilities
128 let n = 1 `max` (length ts `div` caps)
129 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
132 --terms' :: Patterns -> Text -> Corpus [[Text]]
133 terms' pats txt = pure $ concat $ extractTermsWithList pats txt
136 -- | TODO Minimal Example
138 -- let patterns = buildPatterns testTermList
139 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
140 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
143 testCorpus :: [(Int, [Text])]
144 testCorpus = [ (1998, [pack "The beees"])
145 , (1999, [ pack "The bees and the flowers"
146 --, pack "The bees and the flowers"
150 testTermList :: TermList
151 testTermList = [ ([pack "bee"], [[pack "bees"]])
152 , ([pack "flower"], [[pack "flowers"]])