3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Main specifications to index a corpus with a term list
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
20 import Data.ByteString.Lazy (writeFile)
22 import Data.Maybe (catMaybes)
23 import Data.Text (pack)
24 import qualified Data.Text as DT
26 import Data.Tuple.Extra (both)
27 import qualified Data.Vector as DV
28 import qualified Data.Maybe as DMaybe
30 import Control.Monad (zipWithM)
31 import Control.Monad.IO.Class
34 import qualified Data.IntMap as DIM
35 import qualified Data.Map as DM
40 import Data.Text (Text)
41 import Data.List (cycle, concat, unwords)
42 import Data.List.Split (chunksOf)
43 import System.IO (hPutStr, hFlush, stderr)
44 import System.Environment
45 import Control.Concurrent.Async as CCA (mapConcurrently)
46 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
49 import Gargantext.Prelude
50 import Gargantext.Core
51 import Gargantext.Core.Types
52 import Gargantext.Text.Terms
53 import Gargantext.Text.Context
54 import Gargantext.Text.Terms.WithList
55 import Gargantext.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
56 import Gargantext.Text.List.CSV (csvMapTermList)
57 import Gargantext.Text.Terms (terms)
58 import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
60 ------------------------------------------------------------------------
63 data CoocByYear = CoocByYear { year :: Int
64 , nbContexts :: NbContexts
65 , coocurrences :: Map (Text, Text) Coocs
66 } deriving (Show, Generic)
68 data CoocByYears = CoocByYears { years :: [CoocByYear] }
69 deriving (Show, Generic)
73 instance ToJSON CoocByYear
74 instance ToJSON CoocByYears
75 ------------------------------------------------------------------------
80 -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
81 filterTermsAndCooc patterns (year, ts) = do
83 r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
85 pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
90 (p, _) <- threadCapability tid
92 ["filterTermsAndCooc:", m, show year, "on proc", show p]
96 [corpusFile, termListFile, outputFile] <- getArgs
98 --corpus :: IO (DM.IntMap [[Text]])
99 corpus <- DM.fromListWith (<>)
101 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
103 <$> readFile corpusFile
105 -- termListMap :: [Text]
106 termList <- csvMapTermList termListFile
108 putStrLn $ show $ length termList
110 let patterns = buildPatterns termList
112 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
113 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
114 writeFile outputFile $ encode (CoocByYears r)
118 ------------------------------------------------------------------------
120 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
122 bs <- zipWithM g (cycle "-\\|/") xs
123 liftIO $ hPutStr stderr "\rDone\n"
127 liftIO $ hPutStr stderr ['\r',c]
128 liftIO $ hFlush stderr
131 -- | Optimi that need further developments (not used yet)
132 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
133 mapConcurrentlyChunked f ts = do
134 caps <- getNumCapabilities
135 let n = 1 `max` (length ts `div` caps)
136 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
139 --terms' :: Patterns -> Text -> Corpus [[Text]]
140 terms' pats txt = pure $ concat $ extractTermsWithList pats txt
143 -- | TODO Minimal Example
145 -- let patterns = buildPatterns testTermList
146 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
147 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
150 testCorpus :: [(Int, [Text])]
151 testCorpus = [ (1998, [pack "The beees"])
152 , (1999, [ pack "The bees and the flowers"
153 --, pack "The bees and the flowers"
157 testTermList :: TermList
158 testTermList = [ ([pack "bee"], [[pack "bees"]])
159 , ([pack "flower"], [[pack "flowers"]])