Portability : POSIX
-}
-
module Gargantext.Core.Viz.Phylo.PhyloMaker where
+import Control.DeepSeq (NFData)
+import Control.Lens hiding (Level)
+import Control.Parallel.Strategies (parList, rdeepseq, using)
import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
+import Data.Text (Text)
import Data.Vector (Vector)
+import Debug.Trace (trace)
-import Gargantext.Prelude
-import Gargantext.Core.Viz.AdaptativePhylo
+import Gargantext.Core.Methods.Distances (Distance(Conditional))
+import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
+import Gargantext.Core.Text.Context (TermList)
+import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
+import Gargantext.Core.Viz.Phylo
+import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
import Gargantext.Core.Viz.Phylo.PhyloTools
-import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
-import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
-import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
-import Gargantext.Core.Methods.Distances (Distance(Conditional))
-
-import Control.DeepSeq (NFData)
-import Control.Parallel.Strategies (parList, rdeepseq, using)
-import Debug.Trace (trace)
-import Control.Lens hiding (Level)
+import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
+import Gargantext.Prelude
-import qualified Data.Vector as Vector
import qualified Data.Set as Set
+import qualified Data.Vector as Vector
------------------
-- | To Phylo | --
| PhyloN { _phylo'_phylo1 :: Phylo}
-toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
-toPhylo' (PhyloN phylo) = toPhylo'
-toPhylo' (PhyloBase phylo) = toPhylo
+toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
+toPhylo' (PhyloN phylo) = toPhylo'
+toPhylo' (PhyloBase phylo) = toPhylo
-}
-toPhylo :: [Document] -> TermList -> Config -> Phylo
-toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
- $ traceToPhylo (phyloLevel conf) $
- if (phyloLevel conf) > 1
- then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
- else phylo1
+toPhylo :: Phylo -> Phylo
+toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
+ $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
+ if (phyloLevel $ getConfig phyloStep) > 1
+ then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
+ else phylo1
where
--------------------------------------
- phylo1 :: Phylo
- phylo1 = toPhylo1 docs phyloBase
- -- > AD to db here
+ phyloAncestors :: Phylo
+ phyloAncestors =
+ if (findAncestors $ getConfig phyloStep)
+ then toHorizon phylo1
+ else phylo1
--------------------------------------
- phyloBase :: Phylo
- phyloBase = toPhyloBase docs lst conf
- -- > AD to db here
+ phylo1 :: Phylo
+ phylo1 = toPhylo1 phyloStep
--------------------------------------
-
--------------------
-- | To Phylo 1 | --
--------------------
toGroupsProxi :: Level -> Phylo -> Phylo
-toGroupsProxi lvl phylo =
+toGroupsProxi lvl phylo =
let proximity = phyloProximity $ getConfig phylo
- groupsProxi = foldlWithKey (\acc pId pds ->
+ groupsProxi = foldlWithKey (\acc pId pds ->
-- 1) process period by period
let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
- $ elems
- $ view ( phylo_periodLevels
- . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
+ $ elems
+ $ view ( phylo_periodLevels
+ . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
. phylo_levelGroups ) pds
next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
-- 2) compute the pairs in parallel
- pairs = map (\(id,ngrams) ->
- map (\(id',ngrams') ->
+ pairs = map (\(id,ngrams) ->
+ map (\(id',ngrams') ->
let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
- ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
+ ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
) egos
pairs' = pairs `using` parList rdeepseq
in acc ++ (concat pairs')
) [] $ phylo ^. phylo_periods
- in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
+ in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
-appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
+appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
$ over ( phylo_periods
. traverse
. traverse)
(\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
then
- let pId = phyloLvl ^. phylo_levelPeriod
+ let pId = phyloLvl ^. phylo_levelPeriod
+ pId' = phyloLvl ^. phylo_levelPeriod'
phyloCUnit = m ! pId
- in phyloLvl
+ in phyloLvl
& phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
groups ++ [ (((pId,lvl),length groups)
- , f obj pId lvl (length groups)
+ , f obj pId pId' lvl (length groups)
(elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
] ) [] phyloCUnit)
- else
+ else
phyloLvl )
- phylo
+ phylo
-cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
-cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
+cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
+cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
(fis ^. phyloClique_support)
+ (fis ^. phyloClique_weight)
+ (fis ^. phyloClique_sources)
(fis ^. phyloClique_nodes)
(ngramsToCooc (fis ^. phyloClique_nodes) coocs)
(1,[0]) -- branchid (lvl,[path in the branching tree])
(fromList [("breaks",[0]),("seaLevels",[0])])
- [] [] [] [] []
+ [] [] [] [] [] [] []
-toPhylo1 :: [Document] -> Phylo -> Phylo
-toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
- Constante start gap -> constanteTemporalMatching start gap
- $ appendGroups cliqueToGroup 1 phyloClique phyloBase
- Adaptative steps -> adaptativeTemporalMatching steps
- $ toGroupsProxi 1
- $ appendGroups cliqueToGroup 1 phyloClique phyloBase
+toPhylo1 :: Phylo -> Phylo
+toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
+ Constante start gap -> constanteTemporalMatching start gap phyloStep
+ Adaptative steps -> adaptativeTemporalMatching steps phyloStep
+
+-----------------------
+-- | To Phylo Step | --
+-----------------------
+
+
+indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
+indexDates' m = map (\docs ->
+ let ds = map (\d -> date' d) docs
+ f = if (null ds)
+ then ""
+ else toFstDate ds
+ l = if (null ds)
+ then ""
+ else toLstDate ds
+ in (f,l)) m
+
+
+-- To build the first phylo step from docs and terms
+-- QL: backend entre phyloBase et phyloClique
+toPhyloStep :: [Document] -> TermList -> PhyloConfig -> Phylo
+toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
+ Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
+ Adaptative _ -> toGroupsProxi 1
+ $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
where
--------------------------------------
phyloClique :: Map (Date,Date) [PhyloClique]
phyloClique = toPhyloClique phyloBase docs'
--------------------------------------
docs' :: Map (Date,Date) [Document]
+ -- QL: Time Consuming here
docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
- -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
--------------------------------------
-
+ phyloBase :: Phylo
+ phyloBase = toPhyloBase docs lst conf
+ --------------------------------------
---------------------------
-- | Frequent Item Set | --
-- To filter nested Fis
filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
-filterCliqueByNested m =
- let clq = map (\l ->
+filterCliqueByNested m =
+ let clq = map (\l ->
foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
then mem
- else
+ else
let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
in fMax ++ [f] ) [] l)
- $ elems m
+ $ elems m
clq' = clq `using` parList rdeepseq
- in fromList $ zip (keys m) clq'
+ in fromList $ zip (keys m) clq'
-- | To transform a time map of docs into a time map of Fis with some filters
toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
-toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
+toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
Fis s s' -> -- traceFis "Filtered Fis"
- filterCliqueByNested
+ filterCliqueByNested
{- \$ traceFis "Filtered by clique size" -}
$ filterClique True s' (filterCliqueBySize)
{- \$ traceFis "Filtered by support" -}
$ filterClique True s (filterCliqueBySupport)
{- \$ traceFis "Unfiltered Fis" -}
phyloClique
- MaxClique s -> filterClique True s (filterCliqueBySize)
- phyloClique
+ MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
+ phyloClique
where
- --------------------------------------
+ --------------------------------------
phyloClique :: Map (Date,Date) [PhyloClique]
- phyloClique = case (clique $ getConfig phylo) of
- Fis _ _ ->
- let fis = map (\(prd,docs) ->
- let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
- in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
+ phyloClique = case (clique $ getConfig phylo) of
+ Fis _ _ ->
+ let fis = map (\(prd,docs) ->
+ case (corpusParser $ getConfig phylo) of
+ Csv' _ -> let lst = toList
+ $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
+ in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
+ _ -> let lst = toList
+ $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
+ in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
+ )
$ toList phyloDocs
fis' = fis `using` parList rdeepseq
in fromList fis'
- MaxClique _ ->
- let mcl = map (\(prd,docs) ->
+ MaxClique _ thr filterType ->
+ let mcl = map (\(prd,docs) ->
let cooc = map round
$ foldl sumCooc empty
- $ map listToMatrix
+ $ map listToMatrix
$ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
- in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
+ in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
$ toList phyloDocs
- mcl' = mcl `using` parList rdeepseq
- in fromList mcl'
- --------------------------------------
+ mcl' = mcl `using` parList rdeepseq
+ in fromList mcl'
+ --------------------------------------
-- dev viz graph maxClique getMaxClique
--------------------
--- To transform the docs into a time map of coocurency matrix
+-- To transform the docs into a time map of coocurency matrix
docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
-docsToTimeScaleCooc docs fdt =
+docsToTimeScaleCooc docs fdt =
let mCooc = fromListWith sumCooc
$ map (\(_d,l) -> (_d, listToMatrix l))
$ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
groupDocsByPeriodRec f prds docs acc =
if ((null prds) || (null docs))
- then acc
- else
+ then acc
+ else
let prd = head' "groupBy" prds
docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
periods = map (inPeriode f docs') pds
periods' = periods `using` parList rdeepseq
- in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
+ in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
$ fromList $ zip pds periods'
where
--------------------------------------
let periods = map (inPeriode f es) pds
periods' = periods `using` parList rdeepseq
- in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
+ in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
$ fromList $ zip pds periods'
where
--------------------------------------
inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
inPeriode f' h (start,end) =
fst $ partition (\d -> f' d >= start && f' d <= end) h
- --------------------------------------
+ --------------------------------------
docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
let nbDocs = fromIntegral $ length docs
freqs = map (/(nbDocs))
$ fromList
- $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
+ $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
$ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
sumFreqs = sum $ elems freqs
in map (/sumFreqs) freqs
docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
-docsToLastTermFreq n docs fdt =
+docsToLastTermFreq n docs fdt =
let last = take n $ reverse $ sort $ map date docs
nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
freqs = map (/(nbDocs))
$ fromList
- $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
+ $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
$ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
sumFreqs = sum $ elems freqs
- in map (/sumFreqs) freqs
+ in map (/sumFreqs) freqs
-- To count the number of docs by unit of time
docsToTimeScaleNb :: [Document] -> Map Date Double
-docsToTimeScaleNb docs =
+docsToTimeScaleNb docs =
let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
- in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
+ in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
$ unionWith (+) time docs'
initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
-initPhyloLevels lvlMax pId =
- fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
+initPhyloLevels lvlMax pId =
+ fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
+
-- To init the basic elements of a Phylo
-toPhyloBase :: [Document] -> TermList -> Config -> Phylo
-toPhyloBase docs lst conf =
+toPhyloBase :: [Document] -> TermList -> PhyloConfig -> Phylo
+toPhyloBase docs lst conf =
let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
+ docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
params = defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
- in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
+ in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
$ Phylo foundations
+ docsSources
(docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs)
(docsToTermFreq docs (foundations ^. foundations_roots))
empty
empty
params
- (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)
+ (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)