Using Graphics Cards to Break Passwords

Post on 25-May-2015

2.088 views 2 download

Transcript of Using Graphics Cards to Break Passwords

Using Graphics Cards to Break Passwords

Andrey Belenko, a.belenko@elcomsoft.com

Introducing

A 2-day Conference On Passwords & PINs

December 7-8, 2010

The Selmer Center, University of Bergen, Norway

–

We can't really think of anybody we know who doesn't use passwords or PIN codes frequently.

Almost every day in fact. Passwords & PINs are everywhere. Many years have passed since better

authentication technologies were made available, such as 2-factor authentication and biometrics.

Still we have more and more passwords and PINs to remember. Why?

That question probably won't be answered here.

What we will do is to present ongoing research, technologies and techniques that aids in the

recovery of passwords. Technologies that can be used for good - and for evil. With technologies

such as Graphics Processing Units and Rainbow Tables being utilized to greatly improve recovery

speeds of passwords, researchers are now talking about 12-character length passwords as the

minimum for being “secure”.

That's a big step from what people are using today. 123456 just won't do anymore as your password.

The usability aspect of passwords and PINs are becoming increasingly important. Many years ago

the Internet was a safe place to be. We didn't really do much business or secret stuff there.

Nowadays we pay our bills, purchase new gadgets and talk to our family, colleagues and secret

lovers – right there on the Internet. Still, security at many sites is almost entirely left in the hands of

the end-user – no guarantees attached. Speaking of which: when did you last change your PINs?

–

We would like to welcome you to Passwords^10.

Covering attacks, defenses and usability of Passwords and PINs.

–

!"#$%&'(#)#'%#$*+$+,&'+&)#-$./$0120#34'&

Twitter hashtag: #passwords10

Why use GPUs?

Core i7 die layout

Transistor count: 1.17B

Memory Controller

IO &

QPI

IO &

QPI L3 Cache L3 Cache

Queue

Core Core Core Core Core Core

Core i7 die layout

Transistor count: 1.17B

L2

L1

Exec

Paging

Branch pred.

Fetch &

L1

Sched.

Decode & μ-code

Mem.

Core i7 die layout

Transistor count: 1.17B

10%

90%

CPU dedicates 1/10 of resources to calculations

GTX 480 die layout

Transistor count: 3B

GTX 480 die layout

Transistor count: 3B

30%

70%

• GPU dedicates 1/3 of resources to calculations

• 2.5x more transistors than CPU

• 7x more computing power overall

PBKDF2-SHA1 with 2000 iterations

i7-970

GTX 480

GTX 580

HD 5970

0K 50K 100K 150K 200K

195K

68K

60K

15.5K

How to use GPUs?

Basics

• GPUs are SIMD and excel at data-parallel tasks

• Program for GPU is called ‘kernel’

• Kernel runs in instances called threads

• Hardware takes care of thread scheduling

• Typical GPU has 100s of processors

• Need 1000s of threads to fully utilize GPU

Example: C = A + B

// Kernel: every thread handles exactly one element, picked by its thread id,
// and stores a[i] + b[i] into c[i].
void sum (int c[], int a[], int b[]) {
    const int tid = getThreadId();

    c[tid] = a[tid] + b[tid];
}

Kernel:

// Host side: three 10-element arrays, then the sum kernel is launched on them.
// NOTE(review): <<10>> is the slide's shorthand launch syntax, presumably
// "10 threads" (one per element) -- it is not valid C or real CUDA syntax.
int A[10], B[10], C[10];sum<<10>> (C, A, B);

Adding vectors:

Example: MD5

// Kernel: each thread processes its own slice of the buffers, selected by its
// thread id: the Index-th MD5_BLOCK_SIZE-byte block of dataIn and the
// Index-th MD5_HASH_SIZE-byte slot of dataOut.
//
// dataIn  - input buffer, one MD5_BLOCK_SIZE-byte block per thread
// dataOut - output buffer, one MD5_HASH_SIZE-byte slot per thread
//
// NOTE(review): MD5() is not defined in view; the (output, input, length)
// argument order is kept exactly as in the original call -- confirm.
void md5 (uint8 *dataIn, uint8 *dataOut) {
    int Index = getThreadId();
    uint8 *in = dataIn + MD5_BLOCK_SIZE * Index;
    uint8 *out = dataOut + MD5_HASH_SIZE * Index;

    // Pass the per-thread pointers. The original passed the base pointers
    // (dataOut, dataIn), so every thread worked on block 0 and wrote slot 0,
    // leaving `in` and `out` unused.
    MD5( out, in, MD5_BLOCK_SIZE );
}

Kernel:

// Host side: Src is sized for 10 blocks of MD5_BLOCK_SIZE bytes and Dst for
// 10 slots of MD5_HASH_SIZE bytes; the md5 kernel is then launched on them.
// NOTE(review): <<10>> is the slide's shorthand launch syntax, presumably
// "10 threads" (one per block) -- it is not valid C or real CUDA syntax.
uint8 Src[10 * MD5_BLOCK_SIZE];uint8 Dst[10 * MD5_HASH_SIZE];md5<<10>> (Src, Dst);

Computing hashes:

GPU Computing Stack

GPU Hardware

High-level Language

Intermediate Language

ISA

Optimization goes here

Translation, no optimizations

GPU Computing Stack: GPU world is bipolar

NVIDIA ATI

CUDA C, OpenCL OpenCL

PTX IL

Not documented Documented for RV700 (48xx)

G80 (8xxx) and up RV670 (38xx) and up

HW

HLL

IL

ISA

Breaking passwords: the CPU way

Generate password

H(p) Verify hash

Computing H(p) takes the most time, so offload it to the GPU

Breaking passwords: the GPU way

CPU CPU GPU

Generate passwords

H(p)

Verify hashes

H(p)

H(p)

...

Breaking passwords: the GPU way

Generate passwords

Verify hashes

H(p)

CPU CPU GPU

• If H(p) is fast, PCIe data transfers are the bottleneck

• E.g. if H(p) is SHA-1, theoretical peak is ~200M p/s

Solution is to offload everything to GPU

Breaking passwords: the GPU way

Generate passwords

Verify hashes

H(p)

GPU GPU GPU

• If H(p) is fast, PCIe data transfers are the bottleneck

• E.g. if H(p) is SHA-1, theoretical peak is ~200M p/s

Solution is to offload everything to GPU

How to use GPUs? Implementation considerations

GPU Computing Stack

NVIDIA ATI

CUDA C, OpenCL OpenCL

PTX IL

Not documented Documented for RV700 (48xx)

G80 (8xxx) and up RV670 (38xx) and up

HW

HLL

IL

ISA

Choosing language: CUDA C vs. PTX

• C code translates into PTX without optimizations

• Optimization is done when compiling PTX

• Intrinsics for device-specific instructions

No real reason for developing in PTX

Choosing language: OpenCL

• Portability requires compilation at runtime

• May take significant time and resources

• Compiler is part of driver ➯ testing hell

• Requires source code in HLL ➯ IP issues

• Implementations are not complete and vary across vendors

Not mature enough

Choosing language: ATI IL

• The only viable option if you love your users

• Access to device-specific instructions

• Best performance

• Not an option if you love your developers

• Poor documentation, poor samples

• Meaningless compiler errors, no debugger

Achieving performance

•Minimize data transfers

•Minimize memory accesses

•Or at least plan them carefully

•Minimize number of registers used

•Fewer registers used means more threads will run simultaneously

•Schedule enough threads to keep GPU processors busy

•Avoid thread divergence

Porting crypto to GPU

• Usually pretty straightforward

• MD5, SHA1 and the like require little to no changes

• Can be tricky sometimes

• RC4 requires many memory accesses, so careful layout is needed

• DES requires table lookups which are very expensive

Porting crypto to GPU: The DES

• Table lookups (s-boxes) are the bottleneck

• Avoid them by using bitslicing

• S-boxes replaced with logic functions

• 32 encryptions in parallel

• Requires many registers

• Performance depends on compiler heuristics

How to use GPUs? Real-world problems

Scalability: Not all GPUs created equal

1. Program should scale nicely with the number of processors on GPU

• Query processor count from the driver

• Partition task accordingly: numThreads = F(numProcessors)

• Also helps to avoid triggering watchdog and freezing screen

Scalability: 8 GPUs in system are not uncommon

2. Program should scale nicely with the number of GPUs

• Query device count from the driver

• Spawn CPU threads to control each device

• Partition task accordingly

Speedup should be linear unless you hit PCIe limits

Compatibility: Not everyone’s got Fermi. Yet.

• New hardware offers great new features

• Cache on Fermi

• bitalign instruction on RV770

• May require different optimization strategy

• May require separate codebase

• Support for legacy hardware shouldn’t be dropped

Be prepared to handle this sort of complexity

Including GPU code — Option 1: include PTX/IL code in your program

Pros

•Recommended way

•Forward compatibility

•No hardware required

Cons

•Compilation at runtime

•Can’t test all hardware

•IP issues

Including GPU code — Option 2: include pre-compiled GPU binaries

Pros

•No dependency on users’ driver

•No compilation at runtime

•Better IP protection

Cons

•May not work with future devices

•Need to precompile for every supported GPU

•No precompiled binary for GPU = no support

Questions?

Thank you

Using Graphics Cards to Break Passwords

Andrey Belenko, a.belenko@elcomsoft.com

Introducing

A 2-day Conference On Passwords & PINs

December 7-8, 2010

The Selmer Center, University of Bergen, Norway

–

We can't really think of anybody we know who doesn't use passwords or PIN codes frequently.

Almost every day in fact. Passwords & PINs are everywhere. Many years have passed since better

authentication technologies were made available, such as 2-factor authentication and biometrics.

Still we have more and more passwords and PINs to remember. Why?

That question probably won't be answered here.

What we will do is to present ongoing research, technologies and techniques that aids in the

recovery of passwords. Technologies that can be used for good - and for evil. With technologies

such as Graphics Processing Units and Rainbow Tables being utilized to greatly improve recovery

speeds of passwords, researchers are now talking about 12-character length passwords as the

minimum for being “secure”.

That's a big step from what people are using today. 123456 just won't do anymore as your password.

The usability aspect of passwords and PINs are becoming increasingly important. Many years ago

the Internet was a safe place to be. We didn't really do much business or secret stuff there.

Nowadays we pay our bills, purchase new gadgets and talk to our family, colleagues and secret

lovers – right there on the Internet. Still, security at many sites is almost entirely left in the hands of

the end-user – no guarantees attached. Speaking of which: when did you last change your PINs?

–

We would like to welcome you to Passwords^10.

Covering attacks, defenses and usability of Passwords and PINs.

–

!"#$%&'(#)#'%#$*+$+,&'+&)#-$./$0120#34'&

Twitter hashtag: #passwords10