-}
-{-# LANGUAGE DataKinds #-}
-{-# LANGUAGE DeriveGeneric #-}
-{-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
-{-# LANGUAGE TypeOperators #-}
+{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
-import qualified Data.Vector as DV
-import qualified Data.Maybe as DMaybe
-
+import Control.Concurrent.Async as CCA (mapConcurrently)
+import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
-
-import qualified Data.IntMap as DM
-
-import Data.Map (Map)
-import Data.Text (Text)
-import Data.List (cycle, concat)
+import Data.Aeson
+import Data.ByteString.Lazy (writeFile)
+import Data.Either (Either(..))
+import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf)
+import Data.Map (Map)
+import qualified Data.Map as DM
+import Data.Text (pack, Text)
+import qualified Data.Text as DT
+import Data.Tuple.Extra (both)
+import qualified Data.Vector as DV
+import GHC.Generics
import System.IO (hPutStr, hFlush, stderr)
import System.Environment
-import Control.Concurrent.Async as CCA (mapConcurrently)
-import Control.Concurrent (getNumCapabilities)
-import Prelude ((>>))
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Types
-import Gargantext.Text.Terms
-import Gargantext.Text.Terms.WithList
-import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
-import Gargantext.Text.List.CSV (csvGraphTermList)
-import Gargantext.Text.Terms (terms)
-import Gargantext.Text.Metrics.Count (coocOn, Coocs)
+import Gargantext.Core.Text.Terms
+import Gargantext.Core.Text.Context
+import Gargantext.Core.Text.Terms.WithList
+import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
+import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
+import Gargantext.Core.Text.Terms (terms)
+import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
+
+------------------------------------------------------------------------
+-- OUTPUT format
+
+data CoocByYear = CoocByYear { year :: Int
+ , nbContexts :: NbContexts
+ , coocurrences :: Map (Text, Text) Coocs
+ } deriving (Show, Generic)
+
+data CoocByYears = CoocByYears { years :: [CoocByYear] }
+ deriving (Show, Generic)
+
+type NbContexts = Int
+
+instance ToJSON CoocByYear
+instance ToJSON CoocByYears
+------------------------------------------------------------------------
+
+filterTermsAndCooc
+ :: Patterns
+ -> (Int, [Text])
+ -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
+filterTermsAndCooc patterns (year, ts) = do
+ log "start"
+ r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
+ log "stop"
+ pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
+ where
+ log m = do
+ tid <- myThreadId
+ (p, _) <- threadCapability tid
+ putStrLn . unwords $
+ ["filterTermsAndCooc:", m, show year, "on proc", show p]
+
+main :: IO ()
+main = do
+ [corpusFile, termListFile, outputFile] <- getArgs
+
+ --corpus :: IO (DM.IntMap [[Text]])
+ eCorpusFile <- readFile corpusFile
+ case eCorpusFile of
+ Right cf -> do
+ let corpus = DM.fromListWith (<>)
+ . DV.toList
+ . DV.map (\n -> (fromMIntOrDec defaultYear $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
+ . snd $ cf
+
+ -- termListMap :: [Text]
+ termList <- csvMapTermList termListFile
+
+ putStrLn $ show $ length termList
+
+ let patterns = buildPatterns termList
+
+ -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
+ r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
+ writeFile outputFile $ encode (CoocByYears r)
+ Left e -> panic $ "Error: " <> (pack e)
+
+
+
+------------------------------------------------------------------------
+-- | Tools
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hFlush stderr
f x
+-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
-filterTermsAndCooc
- :: TermType Lang
- -> (Int, [Text])
- -> IO (Map (Terms, Terms) Coocs)
-filterTermsAndCooc patterns (year, ts) = do
- log "start"
- r <- coocOn identity <$> mapM (\x -> {-log "work" >>-} terms patterns x) ts
- log "stop"
- pure r
- where
- log m = putStrLn $ "filterTermsAndCooc: " <> m <> " " <> show year
---main :: IO [()]
-main = do
- [corpusFile, termListFile, _] <- getArgs
+--terms' :: Patterns -> Text -> Corpus [[Text]]
+terms' pats txt = pure $ concat $ extractTermsWithList pats txt
- --corpus :: IO (DM.IntMap [[Text]])
- corpus <- DM.fromListWith (<>)
- . DV.toList
- . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
- . snd
- <$> readCsv corpusFile
- -- termListMap :: [Text]
- termList <- csvGraphTermList termListFile
+-- | TODO Minimal Example
+--testCooc = do
+-- let patterns = buildPatterns testTermList
+-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
+-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
+
- putStrLn $ show $ length termList
+testCorpus :: [(Int, [Text])]
+testCorpus = [ (1998, [pack "The beees"])
+ , (1999, [ pack "The bees and the flowers"
+ --, pack "The bees and the flowers"
+ ])
+ ]
- let patterns = WithList $ buildPatterns termList
+testTermList :: TermList
+testTermList = [ ([pack "bee"], [[pack "bees"]])
+ , ([pack "flower"], [[pack "flowers"]])
+ ]
- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
- putStrLn $ show r
- --writeFile outputFile cooc