3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Main specifications to index a corpus with a term list
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
20 import Data.ByteString.Lazy (writeFile)
22 import Data.Text (pack)
23 import qualified Data.Text as DT
25 import Data.Tuple.Extra (both)
26 import qualified Data.Vector as DV
28 import Control.Monad (zipWithM)
29 import Control.Monad.IO.Class
32 import qualified Data.Map as DM
37 import Data.Text (Text)
38 import Data.List (cycle, concat, unwords)
39 import Data.List.Split (chunksOf)
40 import System.IO (hPutStr, hFlush, stderr)
41 import System.Environment
42 import Control.Concurrent.Async as CCA (mapConcurrently)
43 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
45 import Gargantext.Prelude
46 import Gargantext.Core
47 import Gargantext.Core.Types
48 import Gargantext.Core.Text.Terms
49 import Gargantext.Core.Text.Context
50 import Gargantext.Core.Text.Terms.WithList
51 import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
52 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
53 import Gargantext.Core.Text.Terms (terms)
54 import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
56 ------------------------------------------------------------------------
59 data CoocByYear = CoocByYear { year :: Int
60 , nbContexts :: NbContexts
61 , coocurrences :: Map (Text, Text) Coocs
62 } deriving (Show, Generic)
64 data CoocByYears = CoocByYears { years :: [CoocByYear] }
65 deriving (Show, Generic)
69 instance ToJSON CoocByYear
70 instance ToJSON CoocByYears
71 ------------------------------------------------------------------------
76 -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
77 filterTermsAndCooc patterns (year, ts) = do
79 r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
81 pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
86 (p, _) <- threadCapability tid
88 ["filterTermsAndCooc:", m, show year, "on proc", show p]
92 [corpusFile, termListFile, outputFile] <- getArgs
94 --corpus :: IO (DM.IntMap [[Text]])
95 corpus <- DM.fromListWith (<>)
97 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
99 <$> readFile corpusFile
101 -- termListMap :: [Text]
102 termList <- csvMapTermList termListFile
104 putStrLn $ show $ length termList
106 let patterns = buildPatterns termList
108 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
109 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
110 writeFile outputFile $ encode (CoocByYears r)
114 ------------------------------------------------------------------------
116 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
118 bs <- zipWithM g (cycle "-\\|/") xs
119 liftIO $ hPutStr stderr "\rDone\n"
123 liftIO $ hPutStr stderr ['\r',c]
124 liftIO $ hFlush stderr
127 -- | Optimi that need further developments (not used yet)
128 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
129 mapConcurrentlyChunked f ts = do
130 caps <- getNumCapabilities
131 let n = 1 `max` (length ts `div` caps)
132 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
135 --terms' :: Patterns -> Text -> Corpus [[Text]]
136 terms' pats txt = pure $ concat $ extractTermsWithList pats txt
139 -- | TODO Minimal Example
141 -- let patterns = buildPatterns testTermList
142 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
143 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
146 testCorpus :: [(Int, [Text])]
147 testCorpus = [ (1998, [pack "The beees"])
148 , (1999, [ pack "The bees and the flowers"
149 --, pack "The bees and the flowers"
153 testTermList :: TermList
154 testTermList = [ ([pack "bee"], [[pack "bees"]])
155 , ([pack "flower"], [[pack "flowers"]])