Farewell to Disks: Efficient Processing of Obstinate Data

Post on 05-Dec-2014

401 views 0 download

description

Professor Diomidis Spinellis gave the lecture "Farewell to Disks: Efficient Processing of Obstinate Data" as part of the Distinguished Lecturer Series "Leon the Mathematician".

Transcript of Farewell to Disks: Efficient Processing of Obstinate Data

1

1

Αποχαιρετισμός στους Δίσκους:

Αποδοτική Επεξεργασία

Περίπλοκων Δεδομένων

Διομήδης Σπινέλλης

Καθηγητής

Τμήμα Διοικητικής Επιστήμης και Τεχνολογίας

Οικονομικό Πανεπιστήμιο Αθηνών

http://www.dmst.aueb.gr/dds

2

3

4

5

6

1

7

8

9

10

2

11

12

3

13

14

L1 D cache

1.3 ns

L2 cache

9.7 ns

DDR RAM

28.5 ns

Hard disk

25.6 ms

Worst case latency (Log scale)

15

16

17

18

1

19

Function call

1.3ns

System call

1.9μs

Local IPC

4.3μs

Remote IPC

1.2ms

Time (Log scale)

select Locations.cc1, Divisions.name, avg(CO2), count(*), Locations.lat, Locations.long, POPDENSITY.DENSITY from Papers inner join Locations on Papers.confLocId = Locations.id inner join Divisions on Locations.cc1 = Divisions.country inner join POPDENSITY on Divisions.name = upper(POPDENSITY.name) where Divisions.code = '00' and CO2 notnull group by Locations.cc1 having count(*) > 20 order by avg(CO2) desc;

20

/* Get the data */ if (mcSet.dataLen) { data = xmalloc(mcSet.dataLen); if (lseek(fd, mcSet.data.off, SEEK_SET) == -1) CORRUPT(); if (read(fd, data, mcSet.dataLen) != mcSet.dataLen) CORRUPT(); if (lseek(fd, mcSet.u.firstMsg, SEEK_SET) == -1) CORRUPT(); for (i = 0; i < mcSet.numMsgs; ++i) { if (read(fd, &mcMsg, sizeof(mcMsg)) != sizeof(mcMsg)) CORRUPT(); if (mcMsg.invalid) { --i; continue; } msg = xmalloc(sizeof(msgT)); memset(msg, '\0', sizeof(*msg)); /* […] */ msg->msgId = mcMsg.msgId; msg->str = xstrdup((char *) (data + mcMsg.msg.off)); } free(data); }

2

21

MMAP(2) FreeBSD System Calls Manual MMAP(2) NAME mmap -- allocate memory, or map files or devices into memory SYNOPSIS #include <sys/mman.h> void * mmap(void *addr, size_t len, int prot, int flags, int fd, off_t offset); DESCRIPTION The mmap() system call causes the pages starting at addr and continuing for at most len bytes to be mapped from the object described by fd, starting at byte offset offset.

22

[dds@istlab /usr/src/sys/vm]$ ls default_pager.c uma_int.h vm_page.c device_pager.c vm.h vm_page.h memguard.c vm_contig.c vm_pageout.c memguard.h vm_extern.h vm_pageout.h phys_pager.c vm_fault.c vm_pager.c pmap.h vm_glue.c vm_pager.h redzone.c vm_init.c vm_param.h redzone.h vm_kern.c vm_phys.c sg_pager.c vm_kern.h vm_phys.h swap_pager.c vm_map.c vm_reserv.c swap_pager.h vm_map.h vm_reserv.h uma.h vm_meter.c vm_unix.c uma_core.c vm_mmap.c vm_zeroidle.c uma_dbg.c vm_object.c vnode_pager.c uma_dbg.h vm_object.h vnode_pager.h

23

3

24

$ ls -lh sparse -rw-r--r-- 1 dds dds 500G Mar 19 20:32 sparse $ du -h sparse 28K sparse

4 διεργασία 1 διεργασία 2

φυσική μνήμη

r/o r/o

25

r/w r/w

διεργασία 1 διεργασία 2

φυσική μνήμη

read read

διεργασία 1 διεργασία 2

φυσική μνήμη

read r/w

αντίγραφο

διεργασία 1 διεργασία 2

φυσική μνήμη

26

5

C++

27

e.g. 1

CC-BY 2.5 Claudio Rocchini

28

01110010011 0111101101101011 0000101101110011 00101 // romane 01110010011 0111101101101011 0000101101110011 1010101110011 // romanus 01110010011 0111101101101011 10101011011000111010101110011 // romulus 01110010011 10101011000100110 0101011 0111001110011 // rubens 01110010011 10101011000100110 0101011 10010 // ruber 01110010011 10101011000100110 100101100011011 0111101101110 // rubicon 01110010011 10101011000100110 100101100011011 1010101101110011001000111 010101110011 // rubicundus

Κατασκευή

δομής

δίσκου

Δομή δίσκου

Κατάλογος

άρθρων

Δομή

μνήμης

Κατασκευή

δένδρου

ριζών

Αρχική

ιστοσελίδα

Ιστοσελίδα

με νέους

δεσμούς

wikipedialize

for (;;) { i = bitpos; // Loop until the end of the current node or the end of the word while (i < p->end && i < len * 8) { // Covering whole byte? if (i % 8 == 0 && i + 8 <= p->end && (i + 8) / 8 <= len && data[i / 8] == p->data[i / 8]) { i += 8; continue; } // Split point if (getbit(data, i) != getbit(p->data, i)) { // Node with the new data struct pnode *n = new_node(data + i / 8, i % 8, (len - i / 8) * 8, NULL, NULL, true); // Tail of the current node struct pnode *t = new_node(p->data + i / 8, i % 8, p->end - (i & ~7), p->zero, p->one, p->is_terminal); // Head of current node if (getbit(data, i)) *p2 = new_node(p->data, bitpos, i, t, n, false); else *p2 = new_node(p->data, bitpos, i, n, t, false); free(p); return; } i++; } // while

29

// Write the given node to the specified file, returning its file offset. // On return the file's offset is set to the first free byte. static long write_node(struct pnode *p, FILE *f) { long my_offset = ftell(f); size_t ret; if (p->one) { struct pnode_disk_one pdo; size_t dlen = datalen(p->end); long len = sizeof(pdo) + dlen; fseek(f, len, SEEK_CUR); pdo.h.type = dt_one; pdo.h.is_terminal = p->is_terminal; pdo.h.has_zero = (p->zero != NULL); pdo.h.has_one = true; pdo.h.begin = p->begin; pdo.h.end = p->end; if (p->zero) write_node(p->zero, f); pdo.one = write_node(p->one, f); long saved_offset = ftell(f); fseek(f, my_offset, SEEK_SET); fwrite(&pdo, 1, sizeof(pdo), f); fwrite(p->data, 1, dlen, f); fseek(f, saved_offset, SEEK_SET); return my_offset; } else {

$ zcat enwiki-latest-all-titles-in-ns0.gz | wc -c 106,237,053 $ wc -c enwiki.pt 144,657,286 enwiki.pt

30

$ curl http://www.kiosek.com/dostoevsky/library/crimeandpunishment.txt | perl -pe 's/[\r\n]/ /g' >crimeandpunishment.txt

31

$ wc crimeandpunishment.txt 0 203,273 1,462,661 crimeandpunishment.txt

$ time ./wpltest en en_US.UTF-8 ISO-8859-1 data/enwiki.pt <crimeandpunishment.txt >/dev/null

$ time ./wpltest en en_US.UTF-8 ISO-8859-1 data/enwiki.pt <crimeandpunishment.txt >/dev/null Checked 406,225 prefixes real 0m5.859s # Cold cache real 0m1.876s # Warm cache user 0m1.780s sys 0m0.090s

32

// Prevent memory alignment problems

memcpy(&end, &(p.h->end), sizeof(end)); while (i < end && i < len * 8) { if (i % 8 == 0 && i + 8 <= end && data[i / 8] == pdata[i / 8]) {

i += 8; prefix += 8;

continue; }

// Split point if (getbit(data, i) != getbit(pdata, i))

return best / 8; i++;

prefix++; }

if (i == end && p.h->is_terminal) best = prefix;

if (i == len * 8) return best / 8; // Move to next node

bitpos = end % 8; int covered = end / 8;

if (getbit(data, end)) { if (!p.h->has_one)

return best / 8; switch (p.h->type) {

case dt_both: p.h = (struct pnode_disk_head *)(base + p.b->one);

break; case dt_one:

p.h = (struct pnode_disk_head *)(base + p.o->one); break;

case dt_short: default: assert(0);

} } else {

if (!p.h->has_zero) return best / 8;

switch (p.h->type) { case dt_both:

p.h = (struct pnode_disk_head *)(base + p.b->zero); break;

case dt_one: // Advance to the end of this node

p.h = (struct pnode_disk_head *)((char *)p.h + sizeof(struct pnode_disk_one) + datalen(end)); break;

case dt_short: // Advance to the end of this node p.h = (struct pnode_disk_head *)((char *)p.h + sizeof(struct pnode_disk_short) + datalen(end));

break; default:

assert(0); }

}

/*

* You are not expected

* to understand this

*/

33

e.g. 2

The problem with wikipedia

34

Λίστα ακμών Δομή

δεδομένων

γράφου

Κατασκευή

γράφου

Κορυφές

αρχής, τέλους Διαδρομή

BFS

Δομή

δεδομένων

γράφου

Λίστα ακμών

Κατασκευή

γράφου

Κορυφές

αρχής, τέλους Διαδρομή

BFS

Δομή

δεδομένων

γράφου

35

Λίστα ακμών Δομή

δεδομένων

γράφου

Κατασκευή

γράφου

// Loop through all lines, // adding them to the graph while (std::getline(in, line)) { int split = line.find('\001'); if (split == std::string::npos) { std::cerr << "No separator: " << line << std::endl; continue; } n.setName(line.substr(0, split)); NodesIter from(entries->insert(n).first); n.setName(line.substr(split + 1)); NodesIter to(entries->insert(n).first); (const_cast<Node &>(*from)).addEdge( const_cast<Node *>(&*to)); }

Λίστα ακμών

Κατασκευή

γράφου

Κορυφές

αρχής, τέλους Διαδρομή

BFS

Δομή

δεδομένων

γράφου

36

Διαδρομή

BFS

Δομή

δεδομένων

γράφου

Tacoma Narrows Bridge

p=

Suspension bridge

p=

Washington

p=

Geneva

p=

William Howard Taft

p=

Montana

p=

Ουρά

[]=

Tacoma Narrows Bridge

p=

Suspension bridge

p=

Washington

p=

Geneva

p=

William Howard Taft

p=

Montana

p=

Ουρά

[]=Tacoma Narrows Bridge

37

Tacoma Narrows Bridge

p=

Suspension bridge

p=

Washington

p=

Geneva

p=

William Howard Taft

p=

Montana

p=

Ουρά

[]=Tacoma Narrows Bridge

Tacoma Narrows Bridge

p=

Suspension bridge

p=

Washington

p=

Geneva

p=

William Howard Taft

p=

Montana

p=

Ουρά

[]=

Tacoma Narrows Bridge

p=

Suspension bridge

p=Tacoma Narrows Bridge

Washington

p=

Geneva

p=

William Howard Taft

p=

Montana

p=

Ουρά

[]= Suspension bridge

38

Suspension bridge

p=Tacoma Narrows Bridge

Washington

p=Tacoma Narrows Bridge

Geneva

p=

William Howard Taft

p=

Montana

p=

Ουρά

[]= Suspension bridge

Washington

Tacoma Narrows Bridge

p=

Tacoma Narrows Bridge

p=

Suspension bridge

p=Tacoma Narrows Bridge

Washington

p=Tacoma Narrows Bridge

Geneva

p=

William Howard Taft

p=

Montana

p=

Ουρά

[]= Suspension bridge

Washington

Suspension bridge

p=Tacoma Narrows Bridge

Washington

p=Tacoma Narrows Bridge

Geneva

p=Suspension bridge

William Howard Taft

p=

Montana

p=

Ουρά

[]= Washington

Geneva

Tacoma Narrows Bridge

p=

39

Washington

p=Tacoma Narrows Bridge

Geneva

p=Suspension bridge

William Howard Taft

p=

Montana

p=Washington

Ουρά

[]= Geneva

Montana

Tacoma Narrows Bridge

p=

Suspension bridge

p=Tacoma Narrows Bridge

Washington

p=Tacoma Narrows Bridge

Geneva

p=Suspension bridge

William Howard Taft

p=

Montana

p=Washington

Ουρά

[]=Montana

Tacoma Narrows Bridge

p=

Suspension bridge

p=Tacoma Narrows Bridge

Washington

p=Tacoma Narrows Bridge

Geneva

p=Suspension bridge

William Howard Taft

p=Geneva

Montana

p=Washington

Ουρά

[]=Montana

Tacoma Narrows Bridge

p=

Suspension bridge

p=Tacoma Narrows Bridge

40

/*
 * Breadth-first search of the graph, starting at "from" and stopping
 * as soon as "to" is reached.
 *
 * Colors track progress: a White node has not been reached yet, a Gray
 * node has been reached and queued, and a Black node has had all of its
 * edges examined. Each newly reached node records the node it was
 * reached from through setPredecessor(); presumably the caller walks
 * those links back from "to" to print the path.
 *
 * Parameters:
 *   from - node where the search starts
 *   to   - node being looked for
 *   n    - not referenced by the code shown here (the caller passes the
 *          number of graph entries)
 *
 * Returns true if "to" was reached by following at least one edge,
 * false otherwise. Because "from" is colored Gray before the loop and
 * only White nodes are compared against "to", a search with
 * from == to returns false.
 */
static bool breadthFirstSearchFor(NodePtr from, NodePtr to,
    size_t n) {
    std::queue<NodePtr> pending;

    from->setColor(Node::Gray);
    pending.push(from);

    while (!pending.empty()) {
        NodePtr u = pending.front();
        pending.pop();

        // Bind by reference so that the edge list is not copied
        // for every node taken off the queue.
        const Edges &edges = u->getEdges();
        for (Edges::const_iterator j = edges.begin();
            j != edges.end(); ++j) {
            // Skip nodes that have already been reached
            if ((*j)->getColor() != Node::White)
                continue;

            (*j)->setColor(Node::Gray);
            (*j)->setPredecessor(u);
            if (*j == to)
                return true;    // Found
            pending.push(*j);
        }
        // All edges of u have been examined
        u->setColor(Node::Black);
    }
    return false;               // Not found
}

Λίστα ακμών

Κατασκευή

γράφου

Κορυφές

αρχής, τέλους Διαδρομή

BFS

Δομή

δεδομένων

γράφου

Δομή

δεδομένων

γράφου

41

42

Δομή

δεδομένων

γράφου

#include <string> #include <iostream> #include <queue> #include <list> #include <functional> #include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/offset_ptr.hpp> #include <boost/interprocess/allocators/allocator.hpp> #include <boost/unordered_set.hpp> #include <boost/interprocess/containers/string.hpp> #include <boost/interprocess/containers/slist.hpp> #include <boost/filesystem.hpp> #include <boost/filesystem/operations.hpp>

43

#include <string> #include <iostream> #include <queue> #include <list> #include <functional> #include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/offset_ptr.hpp> #include <boost/interprocess/allocators/allocator.hpp> #include <boost/unordered_set.hpp> #include <boost/interprocess/containers/string.hpp> #include <boost/interprocess/containers/slist.hpp> #include <boost/filesystem.hpp> #include <boost/filesystem/operations.hpp>

// Type aliases used by the graph code.
// NOTE(review): the unqualified names (managed_mapped_file, allocator,
// basic_string, offset_ptr, slist) presumably come from
// boost::interprocess through a using-directive that is not shown
// here, and Node must already be declared at this point -- confirm.

// Segment manager type of the mapped file
typedef managed_mapped_file::segment_manager SegmentManager;
// Character allocator and the string type built on it (used for Node names)
typedef allocator<char, SegmentManager> CharAllocator;
typedef basic_string<char, std::char_traits<char>, CharAllocator> CharString;
// The set of all graph nodes, hashed with boost::hash<Node>
typedef allocator<Node, SegmentManager> NodeAllocator;
typedef boost::unordered_set<Node, boost::hash<Node>, NodeEqual, NodeAllocator> Nodes;
// Pointer to a node, and the list of such pointers forming a node's edges
typedef offset_ptr<Node> NodePtr;
typedef allocator<NodePtr, SegmentManager> NodePtrAllocator;
typedef slist<NodePtr, NodePtrAllocator> Edges;
// Generic allocator handed to constructors of the contained types
typedef allocator<void, SegmentManager> VoidAllocator;
typedef allocator<Edges, SegmentManager> EdgesAllocator;

// A graph node, suitable for performing a breadth-first search class Node { public: typedef enum {White, Gray, Black} Color; private: CharString name; // Node name Color color; // Color used during BFS NodePtr predecessor; // BFS predecessor Edges edges; // Node's edges public: // Since VoidAllocator is convertible to any other // allocator<T>, we can simplify the initialization // taking just one allocator for all inner containers. Node(const std::string &n, const VoidAllocator &voidAlloc) : name(n.begin(), n.end(), voidAlloc), color(White), predecessor(NULL), edges(voidAlloc) {} void addEdge(NodePtr p) { edges.push_front(p); } };

44

/* * Read ^A-separated nodes from the inputFile, storing the graph * structure in the specified backingFile. */ static void readData(const char *backingFile, const char *inputFile) { std::ifstream in(inputFile, std::ios::binary); if (in.fail()) { perror(inputFile); exit(1); } boost::filesystem::remove_all(backingFile); managed_mapped_file segment(create_only, backingFile, FileSize); // An allocator convertible to any allocator<T, SegmentManager> type VoidAllocator allocInst (segment.get_segment_manager()); // Construct the memory map and fill it Nodes *entries = segment.construct<Nodes>("entries")(Elements, boost::hash<Node>(), NodeEqual(), allocInst); std::string line; Node n(std::string(), allocInst); // To save construction costs

/* * Search and report the shortest graph path from "from" to "to" * The graph is stored in backingFile. */ static void searchData(const char *backingFile, const std::string &from, const std::string &to) { managed_mapped_file segment(open_copy_on_write, backingFile); // An allocator convertible to any allocator<T, SegmentManager> VoidAllocator allocInst(segment.get_segment_manager()); // Obtain the previously saved entries Nodes *entries = segment.find<Nodes>("entries").first; NodePtr toPtr; bool found = breadthFirstSearchFor( findNode(entries, Node(from, allocInst)), toPtr = findNode(entries, Node(to, allocInst)), entries->size());

45

Λίστα ακμών

Κατασκευή

γράφου

Κορυφές

αρχής, τέλους Διαδρομή

BFS

Δομή

δεδομένων

γράφου

46

$ ./smap -r graph.bin graph.txt

$ ./smap -s graph.bin 'Tacoma Narrows Bridge'\ 'William howard taft' 0% 10 20 30 40 50 60 70 80 90 100% |----|----|----|----|----|----|----|----|----|----| *

Tacoma Narrows Bridge Washington Montana William howard taft

$ ./smap -s graph.bin 'Tacoma Narrows Bridge'\ '24-hour analog dial' 0% 10 20 30 40 50 60 70 80 90 100% |----|----|----|----|----|----|----|----|----|----| **

Tacoma Narrows Bridge Suspension bridge Geneva Watch 24-hour analog dial

47

$ ./smap -s graph.bin 'Tacoma Narrows Bridge' 'Wet t-shirt contest' 0% 10 20 30 40 50 60 70 80 90 100% |----|----|----|----|----|----|----|----|----|----| *

Tacoma Narrows Bridge Washington Starbucks Toplessness Wet t-shirt contest

The problem with wikipedia

48

Performance

MySQL mmap

Server 15:59:43

Client system 03:16:59 00:04:32

Client user 00:52:48 00:04:52

00:00

06:00

12:00

18:00

Χρόνος (ω:λ)

Κατασκευή δομής δεδομένων

MySQL mmap

Waiting 348 3.886

Server 259

Client system 58 19

Client user 16 2

0 500 1.000 1.500 2.000 2.500 3.000 3.500 4.000 4.500

Χρόνος / κόμβο (μs)

Taft: Κρύα κρυφή μνήμη

49

MySQL mmap

Waiting 23 0

Server 305

Client system 59 5

Client user 15 3

0 50 100 150 200 250 300 350 400 450

Χρόνος / κόμβο (μs)

Taft: Ζεστή κρυφή μνήμη

MySQL mmap

Waiting 415 1.977

Server 472

Client system 103 10

Client user 26 4

0

500

1.000

1.500

2.000

2.500

Χρόνος / κόμβο (μs)

24h Clock: Κρύα κρυφή μνήμη

MySQL mmap

Waiting 120 0

Server 469

Client system 103 3

Client user 27 4

0 100 200 300 400 500 600 700 800

Χρόνος / κόμβο (μs)

24h Clock: Ζεστή κρυφή μνήμη

50

0

1

2

3

4

5

0 2000 4000 6000 8000

Χρόνος (ρ) / κόμβο (ms)

Αριθμός κόμβων

Χιλιάδες

Κλιμάκωση απόδοσης (κρύα μνήμη)

mmap

MySQL

51

ACID

A

52

C

I

D

53

SQL

54

A case…

Application code

vector<Customer> customers1;

Customer c1(d1,cd1,s1,p1);

customers1.push_back(c1);

vector<Truck> trucks;

Truck t1(cs1,dc1,pc1,rlp1, customers1);

trucks.push_back(t1);

….

ODBC

JDBC

55

register

L1 D cache

L2 cache

DRAM

HDD cache

HDD / SSD

L3 cache

56

534,681,000 εντολές ΚΜΕ

1

10

100

1,000

10,000

100,000

L1 D cache L2 cache DDR RAM Hard disk

Μέγιστη διεκπεραιωτικότητα (MB/s)

L1 D cache 1.3 ns

L2 cache 9.7 ns

DDR RAM 28.5 ns

Hard disk 25.6 ms

Χείριστη αναμονή (λογ. κλιμ.)

57

L1 D cache

1.3 ns

L2 cache

9.7 ns

DDR RAM

28.5 ns

Hard disk

25.6 ms

Χείριστη αναμονή (λογ. κλιμ.)

58

// Write the given node to the specified file, returning its file offset. // On return the file's offset is set to the first free byte. static long write_node(struct pnode *p, FILE *f) { long my_offset = ftell(f); size_t ret; if (p->one) { struct pnode_disk_one pdo; size_t dlen = datalen(p->end); long len = sizeof(pdo) + dlen; fseek(f, len, SEEK_CUR); pdo.h.type = dt_one; pdo.h.is_terminal = p->is_terminal; pdo.h.has_zero = (p->zero != NULL); pdo.h.has_one = true; pdo.h.begin = p->begin; pdo.h.end = p->end; if (p->zero) write_node(p->zero, f); pdo.one = write_node(p->one, f); long saved_offset = ftell(f); fseek(f, my_offset, SEEK_SET); fwrite(&pdo, 1, sizeof(pdo), f); fwrite(p->data, 1, dlen, f); fseek(f, saved_offset, SEEK_SET); return my_offset; } else {

59

#include <boost/interprocess/managed_mapped_file.hpp> #include <boost/interprocess/offset_ptr.hpp> #include <boost/interprocess/allocators/allocator.hpp> #include <boost/unordered_set.hpp> #include <boost/interprocess/containers/string.hpp> #include <boost/interprocess/containers/slist.hpp>

w r/ο

βήμα 1 βήμα Ν

φυσική μνήμη

read r/w

αντίγραφο

διεργασία 1 διεργασία 2

φυσική μνήμη

60

www.spinellis.gr

twitter.com/CoolSWEng

dds@aueb.gr

61

www.spinellis.gr/wpl

www.spinellis.gr/blog/20101030/smap.cpp