-- (c) CP (2012/3-16/17) -- NB: this is not a proper library, it is just an "ad hoc" illustration of -- the map-reduce programming strategy. A more accurate account -- is given in: R. Lammel's "Google's MapReduce programming model --- Revisited", -- http://dx.doi.org/10.1016/j.scico.2007.07.001 import Data.List import Cp import Exp import List import Data.List.Split import BTree import LTree --- a tiny internet with 5 URLs only: db = zip [1..] [doc1, doc2, doc3, doc4, doc5] where doc1 = "This is an example of document and a good example" doc2 = "This is another document" doc3 = "A piece of text is given here as example" doc4 = "Yet another example" doc5 = "This is the fifth document" -- we want to build a searchable index from it: idx = (finalize . reduce . mapstep . prepare) db -- NB: use "pict it" to visualize each step in the pipeline -- analysis of the process pipeline --- document inversion is a costly operation invert :: Eq a => [(a, String)] -> [(String, [a])] invert = cataList (h . (id -|- (conv . (id >< words)) >< id)) where h=either nil ucol -- prepare data for 2 docs per machine prepare = anaLTree lsplit . (chunksOf 2) -- map step (inversions run in parallel) mapstep = fmap invert -- reduce step (join everything together) reduce = cataLTree (either id (uncurry col)) -- build a searchable index finalize = anaBTree qsep -- aux functions conv :: (Eq a, Eq b) => (a, [b]) -> [(b, [a])] conv = map (id> [(b, [a])] -> [(b, [a])] -> [(b, [a])] col m n = m .\ n ++ n .\ m ++ [(a,x++concat[l' | (a',l') <- n, a'==a]) | (a,x) <- m .= n] ucol = uncurry col m .\ n = [(a,b) | (a,b) <- m, not(a `elem` (map fst n))] m .= n = [(a,b) | (a,b) <- m, a `elem` (map fst n)] -------------------------------------------------------