MLlib: RDD-based – Otros ejemplos
30 min | Última modificación: Noviembre 6, 2020
Inicialización de Spark
[1]:
#
# Load the Spark libraries and start (or reuse) the Spark entry points
#
import findspark

# findspark.init() has to run before the first pyspark import: it is the call
# that locates the Spark installation and makes the pyspark package importable.
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

APP_NAME = "spark-app"
conf = SparkConf().setAppName(APP_NAME)

# getOrCreate() reuses a context/session that is already running, so this cell
# can be re-executed in the same kernel without raising
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
Clustering
[2]:
#
# Download the sample k-means dataset.
# -nc (no-clobber) skips the download when kmeans_data.txt already exists, so
# re-running this cell does not leave duplicate copies (kmeans_data.txt.1, ...).
#
!wget -nc https://raw.githubusercontent.com/apache/spark/master/data/mllib/kmeans_data.txt
--2020-11-07 16:35:55-- https://raw.githubusercontent.com/apache/spark/master/data/mllib/kmeans_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.48.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.48.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 72 [text/plain]
Saving to: ‘kmeans_data.txt’
kmeans_data.txt 100%[===================>] 72 --.-KB/s in 0s
2020-11-07 16:35:56 (490 KB/s) - ‘kmeans_data.txt’ saved [72/72]
[3]:
#
# Preview the first lines of the downloaded file
# (10 lines is the default that `head` prints).
#
!head -n 10 kmeans_data.txt
0.0 0.0 0.0
0.1 0.1 0.1
0.2 0.2 0.2
9.0 9.0 9.0
9.1 9.1 9.1
9.2 9.2 9.2
[4]:
#
# Copy the data file into HDFS.
# -f overwrites the destination if it already exists, so the cell can be
# re-run without failing with "File exists".
#
!hdfs dfs -copyFromLocal -f kmeans_data.txt /tmp/kmeans_data.txt
[5]:
import subprocess
from math import sqrt
from numpy import array
from pyspark.mllib.clustering import KMeans, KMeansModel

# Where the trained model is persisted, and the seed that makes the random
# centroid initialization reproducible between runs.
MODEL_PATH = "/tmp/KMeansModel"
SEED = 42

# Load and parse the data: each line holds one point as space-separated floats
data = sc.textFile("/tmp/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(" ")]))

# Build the model (cluster the data into 2 groups)
clusters = KMeans.train(
    parsedData, 2, maxIterations=10, initializationMode="random", seed=SEED
)


# Evaluate the clustering by accumulating each point's error
def error(point):
    """Return the Euclidean distance from `point` to its assigned cluster center.

    NOTE(review): because of the sqrt, the total printed below is a sum of
    distances rather than a sum of *squared* distances, despite its label.
    The computation is left unchanged so the printed value stays comparable
    with earlier runs of this notebook.
    """
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and reload the model. save() refuses to write into an existing
# directory, so remove any model left by a previous run first (-f keeps the
# command quiet when there is nothing to delete).
# Assumes unqualified paths resolve to HDFS here, as the input file read above
# was uploaded with `hdfs dfs` — TODO confirm for other deployments.
subprocess.run(["hdfs", "dfs", "-rm", "-r", "-f", MODEL_PATH], check=False)
clusters.save(sc, MODEL_PATH)
sameModel = KMeansModel.load(sc, MODEL_PATH)
Within Set Sum of Squared Error = 0.6928203230275529
Standard Scaler
[6]:
#
# Download the sample LIBSVM dataset.
# -nc (no-clobber) skips the download when sample_libsvm_data.txt already
# exists, so re-running this cell does not leave duplicate copies.
#
!wget -nc https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
--2020-11-07 16:36:20-- https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.48.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.48.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt’
sample_libsvm_data. 100%[===================>] 102.28K 531KB/s in 0.2s
2020-11-07 16:36:21 (531 KB/s) - ‘sample_libsvm_data.txt’ saved [104736/104736]
[7]:
#
# Preview the first lines of the downloaded file
# (10 lines is the default that `head` prints).
#
!head -n 10 sample_libsvm_data.txt
0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252 159:252 160:237 182:54 183:227 184:253 185:252 186:239 187:233 188:252 189:57 190:6 208:10 209:60 210:224 211:252 212:253 213:252 214:202 215:84 216:252 217:253 218:122 236:163 237:252 238:252 239:252 240:253 241:252 242:252 243:96 244:189 245:253 246:167 263:51 264:238 265:253 266:253 267:190 268:114 269:253 270:228 271:47 272:79 273:255 274:168 290:48 291:238 292:252 293:252 294:179 295:12 296:75 297:121 298:21 301:253 302:243 303:50 317:38 318:165 319:253 320:233 321:208 322:84 329:253 330:252 331:165 344:7 345:178 346:252 347:240 348:71 349:19 350:28 357:253 358:252 359:195 372:57 373:252 374:252 375:63 385:253 386:252 387:195 400:198 401:253 402:190 413:255 414:253 415:196 427:76 428:246 429:252 430:112 441:253 442:252 443:148 455:85 456:252 457:230 458:25 467:7 468:135 469:253 470:186 471:12 483:85 484:252 485:223 494:7 495:131 496:252 497:225 498:71 511:85 512:252 513:145 521:48 522:165 523:252 524:173 539:86 540:253 541:225 548:114 549:238 550:253 551:162 567:85 568:252 569:249 570:146 571:48 572:29 573:85 574:178 575:225 576:253 577:223 578:167 579:56 595:85 596:252 597:252 598:252 599:229 600:215 601:252 602:252 603:252 604:196 605:130 623:28 624:199 625:252 626:252 627:253 628:252 629:252 630:233 631:145 652:25 653:128 654:252 655:253 656:252 657:141 658:37
1 159:124 160:253 161:255 162:63 186:96 187:244 188:251 189:253 190:62 214:127 215:251 216:251 217:253 218:62 241:68 242:236 243:251 244:211 245:31 246:8 268:60 269:228 270:251 271:251 272:94 296:155 297:253 298:253 299:189 323:20 324:253 325:251 326:235 327:66 350:32 351:205 352:253 353:251 354:126 378:104 379:251 380:253 381:184 382:15 405:80 406:240 407:251 408:193 409:23 432:32 433:253 434:253 435:253 436:159 460:151 461:251 462:251 463:251 464:39 487:48 488:221 489:251 490:251 491:172 515:234 516:251 517:251 518:196 519:12 543:253 544:251 545:251 546:89 570:159 571:255 572:253 573:253 574:31 597:48 598:228 599:253 600:247 601:140 602:8 625:64 626:251 627:253 628:220 653:64 654:251 655:253 656:220 681:24 682:193 683:253 684:220
1 125:145 126:255 127:211 128:31 152:32 153:237 154:253 155:252 156:71 180:11 181:175 182:253 183:252 184:71 209:144 210:253 211:252 212:71 236:16 237:191 238:253 239:252 240:71 264:26 265:221 266:253 267:252 268:124 269:31 293:125 294:253 295:252 296:252 297:108 322:253 323:252 324:252 325:108 350:255 351:253 352:253 353:108 378:253 379:252 380:252 381:108 406:253 407:252 408:252 409:108 434:253 435:252 436:252 437:108 462:255 463:253 464:253 465:170 490:253 491:252 492:252 493:252 494:42 518:149 519:252 520:252 521:252 522:144 546:109 547:252 548:252 549:252 550:144 575:218 576:253 577:253 578:255 579:35 603:175 604:252 605:252 606:253 607:35 631:73 632:252 633:252 634:253 635:35 659:31 660:211 661:252 662:253 663:35
1 153:5 154:63 155:197 181:20 182:254 183:230 184:24 209:20 210:254 211:254 212:48 237:20 238:254 239:255 240:48 265:20 266:254 267:254 268:57 293:20 294:254 295:254 296:108 321:16 322:239 323:254 324:143 350:178 351:254 352:143 378:178 379:254 380:143 406:178 407:254 408:162 434:178 435:254 436:240 462:113 463:254 464:240 490:83 491:254 492:245 493:31 518:79 519:254 520:246 521:38 547:214 548:254 549:150 575:144 576:241 577:8 603:144 604:240 605:2 631:144 632:254 633:82 659:230 660:247 661:40 687:168 688:209 689:31
1 152:1 153:168 154:242 155:28 180:10 181:228 182:254 183:100 209:190 210:254 211:122 237:83 238:254 239:162 265:29 266:254 267:248 268:25 293:29 294:255 295:254 296:103 321:29 322:254 323:254 324:109 349:29 350:254 351:254 352:109 377:29 378:254 379:254 380:109 405:29 406:255 407:254 408:109 433:29 434:254 435:254 436:109 461:29 462:254 463:254 464:63 489:29 490:254 491:254 492:28 517:29 518:254 519:254 520:28 545:29 546:254 547:254 548:35 573:29 574:254 575:254 576:109 601:6 602:212 603:254 604:109 630:203 631:254 632:178 658:155 659:254 660:190 686:32 687:199 688:104
0 130:64 131:253 132:255 133:63 157:96 158:205 159:251 160:253 161:205 162:111 163:4 184:96 185:189 186:251 187:251 188:253 189:251 190:251 191:31 209:16 210:64 211:223 212:244 213:251 214:251 215:211 216:213 217:251 218:251 219:31 236:80 237:181 238:251 239:253 240:251 241:251 242:251 243:94 244:96 245:251 246:251 247:31 263:92 264:253 265:253 266:253 267:255 268:253 269:253 270:253 271:95 272:96 273:253 274:253 275:31 290:92 291:236 292:251 293:243 294:220 295:233 296:251 297:251 298:243 299:82 300:96 301:251 302:251 303:31 317:80 318:253 319:251 320:251 321:188 323:96 324:251 325:251 326:109 328:96 329:251 330:251 331:31 344:96 345:240 346:253 347:243 348:188 349:42 351:96 352:204 353:109 354:4 356:12 357:197 358:251 359:31 372:221 373:251 374:253 375:121 379:36 380:23 385:190 386:251 387:31 399:48 400:234 401:253 413:191 414:253 415:31 426:44 427:221 428:251 429:251 440:12 441:197 442:251 443:31 454:190 455:251 456:251 457:251 468:96 469:251 470:251 471:31 482:190 483:251 484:251 485:113 495:40 496:234 497:251 498:219 499:23 510:190 511:251 512:251 513:94 522:40 523:217 524:253 525:231 526:47 538:191 539:253 540:253 541:253 548:12 549:174 550:253 551:253 552:219 553:39 566:67 567:236 568:251 569:251 570:191 571:190 572:111 573:72 574:190 575:191 576:197 577:251 578:243 579:121 580:39 595:63 596:236 597:251 598:253 599:251 600:251 601:251 602:251 603:253 604:251 605:188 606:94 624:27 625:129 626:253 627:251 628:251 629:251 630:251 631:229 632:168 633:15 654:95 655:212 656:251 657:211 658:94 659:59
1 159:121 160:254 161:136 186:13 187:230 188:253 189:248 190:99 213:4 214:118 215:253 216:253 217:225 218:42 241:61 242:253 243:253 244:253 245:74 268:32 269:206 270:253 271:253 272:186 273:9 296:211 297:253 298:253 299:239 300:69 324:254 325:253 326:253 327:133 351:142 352:255 353:253 354:186 355:8 378:149 379:229 380:254 381:207 382:21 405:54 406:229 407:253 408:254 409:105 433:152 434:254 435:254 436:213 437:26 460:112 461:251 462:253 463:253 464:26 487:29 488:212 489:253 490:250 491:149 514:36 515:214 516:253 517:253 518:137 542:75 543:253 544:253 545:253 546:59 570:93 571:253 572:253 573:189 574:17 598:224 599:253 600:253 601:84 625:43 626:235 627:253 628:126 629:1 653:99 654:248 655:253 656:119 682:225 683:235 684:49
1 100:166 101:222 102:55 128:197 129:254 130:218 131:5 155:29 156:249 157:254 158:254 159:9 183:45 184:254 185:254 186:174 187:2 210:4 211:164 212:254 213:254 214:85 238:146 239:254 240:254 241:254 242:85 265:101 266:245 267:254 268:254 269:254 270:85 292:97 293:248 294:254 295:204 296:254 297:254 298:85 315:12 316:59 317:98 318:151 319:237 320:254 321:254 322:109 323:35 324:254 325:254 326:85 343:41 344:216 345:254 346:254 347:239 348:153 349:37 350:4 351:32 352:254 353:254 354:85 372:7 373:44 374:44 375:30 379:32 380:254 381:254 382:96 407:19 408:230 409:254 410:174 436:197 437:254 438:110 464:197 465:254 466:85 492:197 493:253 494:63 515:37 516:54 517:54 518:45 519:26 520:84 521:221 522:84 523:21 524:31 525:162 526:78 540:6 541:41 542:141 543:244 544:254 545:254 546:248 547:236 548:254 549:254 550:254 551:233 552:239 553:254 554:138 567:23 568:167 569:254 570:254 571:254 572:254 573:229 574:228 575:185 576:138 577:138 578:138 579:138 580:138 581:138 582:44 595:113 596:254 597:254 598:254 599:179 600:64 601:5 623:32 624:209 625:183 626:97
0 155:53 156:255 157:253 158:253 159:253 160:124 183:180 184:253 185:251 186:251 187:251 188:251 189:145 190:62 209:32 210:217 211:241 212:253 213:251 214:251 215:251 216:251 217:253 218:107 237:37 238:251 239:251 240:253 241:251 242:251 243:251 244:251 245:253 246:107 265:166 266:251 267:251 268:253 269:251 270:96 271:148 272:251 273:253 274:107 291:73 292:253 293:253 294:253 295:253 296:130 299:110 300:253 301:255 302:108 319:73 320:251 321:251 322:251 323:251 327:109 328:251 329:253 330:107 347:202 348:251 349:251 350:251 351:225 354:6 355:129 356:251 357:253 358:107 375:150 376:251 377:251 378:251 379:71 382:115 383:251 384:251 385:253 386:107 403:253 404:251 405:251 406:173 407:20 410:217 411:251 412:251 413:253 414:107 430:182 431:255 432:253 433:216 438:218 439:253 440:253 441:182 457:63 458:221 459:253 460:251 461:215 465:84 466:236 467:251 468:251 469:77 485:109 486:251 487:253 488:251 489:215 492:11 493:160 494:251 495:251 496:96 513:109 514:251 515:253 516:251 517:137 520:150 521:251 522:251 523:251 524:71 541:109 542:251 543:253 544:251 545:35 547:130 548:253 549:251 550:251 551:173 552:20 569:110 570:253 571:255 572:253 573:98 574:150 575:253 576:255 577:253 578:164 597:109 598:251 599:253 600:251 601:251 602:251 603:251 604:253 605:251 606:35 625:93 626:241 627:253 628:251 629:251 630:251 631:251 632:216 633:112 634:5 654:103 655:253 656:251 657:251 658:251 659:251 683:124 684:251 685:225 686:71 687:71
0 128:73 129:253 130:227 131:73 132:21 156:73 157:251 158:251 159:251 160:174 182:16 183:166 184:228 185:251 186:251 187:251 188:122 210:62 211:220 212:253 213:251 214:251 215:251 216:251 217:79 238:79 239:231 240:253 241:251 242:251 243:251 244:251 245:232 246:77 264:145 265:253 266:253 267:253 268:255 269:253 270:253 271:253 272:253 273:255 274:108 292:144 293:251 294:251 295:251 296:253 297:168 298:107 299:169 300:251 301:253 302:189 303:20 318:27 319:89 320:236 321:251 322:235 323:215 324:164 325:15 326:6 327:129 328:251 329:253 330:251 331:35 345:47 346:211 347:253 348:251 349:251 350:142 354:37 355:251 356:251 357:253 358:251 359:35 373:109 374:251 375:253 376:251 377:251 378:142 382:11 383:148 384:251 385:253 386:251 387:164 400:11 401:150 402:253 403:255 404:211 405:25 410:11 411:150 412:253 413:255 414:211 415:25 428:140 429:251 430:251 431:253 432:107 438:37 439:251 440:251 441:211 442:46 456:190 457:251 458:251 459:253 460:128 461:5 466:37 467:251 468:251 469:51 484:115 485:251 486:251 487:253 488:188 489:20 492:32 493:109 494:129 495:251 496:173 497:103 512:217 513:251 514:251 515:201 516:30 520:73 521:251 522:251 523:251 524:71 540:166 541:253 542:253 543:255 544:149 545:73 546:150 547:253 548:255 549:253 550:253 551:143 568:140 569:251 570:251 571:253 572:251 573:251 574:251 575:251 576:253 577:251 578:230 579:61 596:190 597:251 598:251 599:253 600:251 601:251 602:251 603:251 604:242 605:215 606:55 624:21 625:189 626:251 627:253 628:251 629:251 630:251 631:173 632:103 653:31 654:200 655:253 656:251 657:96 658:71 659:20
[8]:
#
# Copy the data file into HDFS.
# -f overwrites the destination if it already exists, so the cell can be
# re-run without failing with "File exists".
#
!hdfs dfs -copyFromLocal -f sample_libsvm_data.txt /tmp/sample_libsvm_data.txt
[9]:
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils

# Load the LIBSVM file and split it into parallel RDDs of labels and features
data = MLUtils.loadLibSVMFile(sc, "/tmp/sample_libsvm_data.txt")
label = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

# scaler1 uses the default settings; scaler2 additionally centers on the mean
scaler1 = StandardScaler().fit(features)
scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

# data1 will be unit variance.
data1 = label.zip(scaler1.transform(features))

# data2 will be unit variance and zero mean.
# The features are converted to dense vectors before the mean-centering
# scaler is applied.
data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))

# take(2) fetches only the first two rows; collect()[:2] would ship the whole
# RDD to the driver just to display the same two rows.
data2.take(2)
[9]:
[(0.0,
DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1357, -0.1287, -0.1584, -0.1689, -0.1934, -0.1068, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1338, -0.136, -0.2777, -0.4448, -0.5367, -0.6242, -0.1413, 0.8975, 1.6834, 1.0786, 0.2778, -0.2784, -0.1424, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1238, -0.1741, -0.2875, -0.436, -0.6436, -0.804, -0.5563, 0.9701, 1.0756, 1.0633, 1.2936, 1.5017, -0.4409, -0.2643, -0.1677, -0.1, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1508, -0.1988, -0.3025, -0.4146, -0.5285, -0.7426, -0.4867, 0.8363, 0.9642, 0.8776, 0.7981, 0.8759, 1.3475, 0.0532, -0.2638, -0.2351, -0.1, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1473, -0.2123, -0.293, -0.4025, -0.4646, -0.5713, -0.3357, 0.8337, 0.95, 0.9619, 0.8119, 0.3863, -0.4976, 1.1607, 1.6697, 1.1153, -0.2902, -0.1147, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1737, -0.2422, -0.3911, -0.4552, -0.6045, 0.7703, 1.2783, 1.0811, 1.0242, 1.0347, 0.9802, 0.9655, -0.316, 0.6902, 1.5056, 1.2792, -0.3696, -0.1914, -0.1051, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.138, -0.2088, -0.3229, -0.4268, -0.5353, -0.2181, 1.2681, 1.352, 1.2355, 0.511, -0.174, 1.091, 1.0527, -0.497, -0.1119, 1.4806, 1.0511, -0.4797, -0.2665, -0.139, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1815, -0.2766, -0.3899, -0.4813, -0.1714, 1.4931, 1.6222, 1.465, 0.6979, -1.1195, -0.5165, 0.1072, -0.5779, -0.7185, -0.6714, 1.5629, 1.6563, 0.0125, -0.372, -0.1704, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2219, -0.3518, -0.4713, -0.1571, 0.923, 1.6792, 1.5087, 1.3118, -0.1217, -1.1901, -1.1254, -0.818, -0.6479, -0.5726, -0.5227, 1.8384, 1.7465, 1.1504, -0.4304, -0.2483, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2843, -0.4402, -0.4486, 1.1265, 1.6053, 1.6179, 0.1574, -0.3971, -0.6615, -1.1778, -1.0609, -0.7651, -0.5743, -0.4355, -0.439, 1.9493, 1.7853, 1.3685, -0.4806, -0.2961, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3259, -0.481, 0.027, 1.7985, 1.6685, 0.0692, -0.4706, -0.6015, -1.0855, -1.1359, -1.0485, -0.693, -0.4395, -0.3876, -0.4275, 1.9906, 1.7809, 1.2915, -0.4895, -0.3128, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.3648, -0.52, 1.3905, 1.6753, 1.2655, -0.4698, -0.4063, -0.6375, -1.0602, -1.1191, -1.034, -0.558, -0.3725, -0.3325, -0.455, 2.033, 1.825, 1.4226, -0.4758, -0.3462, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1157, -0.4181, 0.2275, 1.734, 1.595, 0.5263, -0.434, -0.4101, -0.7107, -1.0713, -1.1114, -0.9221, -0.4328, -0.2785, -0.3406, -0.4726, 1.9588, 1.8771, 1.0205, -0.4533, -0.3306, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1415, -0.448, 0.2602, 1.6644, 1.4284, -0.3049, -0.4354, -0.507, -0.7449, -1.0441, -1.0922, -0.7898, -0.3971, -0.239, -0.3188, 0.9086, 1.9452, 1.2092, -0.3608, -0.4134, -0.2729, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1694, -0.456, 0.2275, 1.6294, 1.3892, -0.5733, -0.5166, -0.5899, -0.7526, -1.0396, -1.026, -0.6517, -0.4169, -0.2134, 0.9901, 1.8713, 1.5363, 0.1588, -0.4628, -0.376, -0.2433, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2164, -0.4189, 0.2681, 1.5761, 0.6285, -0.6115, -0.6262, -0.6558, -0.813, -1.0641, -0.8976, -0.6301, 0.177, 1.4175, 1.7782, 0.9774, -0.6336, -0.494, -0.412, -0.3218, -0.2188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1784, -0.3798, 0.3457, 1.5867, 1.2237, -0.7649, -0.8005, -0.804, -0.9076, -1.0231, -0.9076, 0.4239, 1.8329, 1.684, 0.7991, -0.7001, -0.5941, -0.4748, -0.3513, -0.2637, -0.1811, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1344, -0.3229, 0.5249, 1.7598, 1.3422, 0.3071, 
-0.6148, -0.8446, -0.4383, 0.4311, 0.9389, 1.414, 1.1825, 0.6781, -0.2101, -0.6059, -0.5058, -0.4191, -0.3035, -0.207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1862, 0.9773, 1.9703, 1.404, 1.0334, 0.7065, 0.5383, 0.9214, 0.951, 0.9818, 0.5838, 0.1799, -0.7994, -0.668, -0.5031, -0.4184, -0.3189, -0.2282, -0.1715, 0.0, 0.0, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1277, 0.4065, 2.2617, 1.7001, 1.1174, 0.8526, 0.7972, 0.9024, 0.6975, -0.0193, -1.0764, -0.8589, -0.6716, -0.5148, -0.4466, -0.3258, -0.2422, -0.1195, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.156, 0.2961, 1.2624, 1.5267, 1.1794, 1.1322, 0.24, -0.6698, -0.9531, -0.7473, -0.5558, -0.4777, -0.3928, -0.3003, -0.1981, -0.1433, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.142, -0.1421, -0.1612, -0.3261, -0.3703, -0.3936, -0.3809, -0.3466, -0.4038, -0.3488, -0.2193, -0.1792, -0.1559, -0.1345])),
(1.0,
DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1357, -0.1287, -0.1584, -0.1689, -0.1934, -0.1068, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1338, -0.136, -0.2777, -0.4448, -0.5367, -0.6242, -0.6881, -0.6948, -0.752, -0.6295, -0.4556, -0.2784, -0.1424, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1, -0.1238, -0.1741, -0.2875, -0.436, -0.6436, -0.804, -0.9908, -1.1284, -1.1807, -1.1735, 0.1625, 1.654, 2.9437, 1.456, -0.1677, -0.1, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1508, -0.1988, -0.3025, -0.4146, -0.5285, -0.7426, -0.958, -1.2212, -1.3677, -1.4986, -0.4726, 0.9725, 1.3385, 2.2328, 0.6899, -0.2351, -0.1, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1473, -0.2123, -0.293, -0.4025, -0.4646, -0.6769, -0.8752, -1.0931, -1.2883, -1.3399, -1.5449, -0.2822, 0.9171, 1.1518, 1.6697, 0.3318, -0.2902, -0.1147, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1737, -0.2422, -0.3911, -0.4552, -0.6045, -0.769, -0.9098, -1.0683, -1.2682, -1.3542, -0.7106, 0.8217, 1.0475, 0.8892, -0.4683, -0.493, -0.3696, -0.1914, -0.1051, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.138, -0.2088, -0.3229, -0.4268, -0.5353, -0.7243, -0.8115, -0.868, -0.9635, -1.2188, -0.687, 0.8757, 1.2579, 1.4095, 0.0251, -0.7371, -0.6254, -0.4797, -0.2665, -0.139, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1815, -0.2766, -0.3899, -0.4813, -0.6507, -0.7208, -0.7347, -0.7634, -0.9343, -1.2283, 0.2124, 1.2429, 1.5667, 1.3768, -0.6714, -0.7142, -0.652, 
-0.527, -0.372, -0.1704, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2219, -0.3518, -0.4713, -0.5538, -0.6829, -0.6746, -0.6448, -0.6716, -0.9234, -1.0115, 1.0918, 1.4277, 1.7456, 0.296, -0.5227, -0.6557, -0.6196, -0.5969, -0.4304, -0.2483, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2843, -0.4402, -0.5248, -0.6341, -0.6991, -0.6347, -0.5646, -0.6033, -0.6222, 0.5926, 1.0747, 1.5684, 0.9485, -0.4355, -0.439, -0.6253, -0.6184, -0.5921, -0.4806, -0.2961, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3259, -0.481, -0.564, -0.6504, -0.663, -0.5721, -0.4706, -0.6015, -0.0749, 0.9006, 1.0832, 1.1439, -0.1962, -0.3876, -0.4275, -0.6033, -0.6387, -0.6072, -0.4895, -0.3128, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.3648, -0.52, -0.6111, -0.658, -0.6325, -0.4698, -0.4063, 0.4695, 1.0347, 0.8996, 0.6695, -0.2838, -0.3725, -0.3325, -0.455, -0.5868, -0.6275, -0.5629, -0.4758, -0.3462, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1157, -0.4181, -0.5555, -0.6497, -0.6862, -0.6172, -0.434, 0.2464, 2.1708, 1.0379, 0.925, 0.5756, -0.4328, -0.2785, -0.3406, -0.4726, -0.6157, -0.5952, -0.5511, -0.4533, -0.3306, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1415, -0.448, -0.5852, -0.6889, -0.6917, -0.5677, -0.4354, 1.7949, 1.7153, 1.0296, 0.9892, -0.4011, -0.3971, -0.239, -0.4153, -0.5387, -0.6286, -0.615, -0.4975, -0.4134, -0.2729, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1694, -0.456, -0.6004, -0.7124, -0.6809, -0.5733, 0.1425, 1.8975, 1.4928, 1.0449, 0.4887, -0.6517, -0.4169, -0.3468, -0.5009, -0.6209, -0.6878, -0.5856, -0.4628, -0.376, -0.2433, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2164, -0.4189, -0.5748, -0.7379, -0.7196, -0.6115, 2.2104, 1.7714, 1.4063, 0.6537, -0.7917, -0.6301, -0.4706, -0.5488, -0.6097, -0.7001, -0.6336, -0.494, -0.412, -0.3218, -0.2188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1784, -0.3798, -0.5444, -0.7182, -0.7969, -0.7649, 1.5482, 1.4498, 1.3441, -0.2079, -0.9076, -0.7172, -0.6769, -0.6712, -0.7123, -0.7001, -0.5941, -0.4748, -0.3513, -0.2637, -0.1811, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, -0.1344, -0.3229, -0.4823, -0.6564, -0.8607, 0.4194, 1.1844, 1.0708, 1.1535, -0.9426, -1.1122, -0.9351, -0.8269, -0.7878, -0.7512, -0.6059, -0.5058, -0.4191, -0.3035, -0.207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1862, -0.429, -0.6285, -0.4065, 0.8263, 0.9158, 0.8243, -0.1136, -1.2676, -1.2193, -1.0602, -0.9352, -0.7994, -0.668, -0.5031, -0.4184, -0.3189, -0.2282, -0.1715, 0.0, 0.0, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.1277, -0.2734, -0.5009, -0.1365, 1.1085, 0.8526, 0.5022, -1.4036, -1.4192, -1.288, -1.0764, -0.8589, -0.6716, -0.5148, -0.4466, -0.3258, -0.2422, -0.1195, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1, -0.156, -0.2961, 0.3738, 1.5174, 1.1794, 0.8464, -1.0121, -1.0117, -0.9531, -0.7473, -0.5558, -0.4777, -0.3928, -0.3003, -0.1981, -0.1433, -0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.142, -0.1421, 0.5128, 2.8814, 3.1809, 2.5577, -0.3809, -0.3466, -0.4038, -0.3488, -0.2193, -0.1792, -0.1559, -0.1345]))]
Normalizer
[10]:
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.util import MLUtils

# Load the LIBSVM file and split it into parallel RDDs of labels and features
data = MLUtils.loadLibSVMFile(sc, "/tmp/sample_libsvm_data.txt")
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

normalizer1 = Normalizer()
normalizer2 = Normalizer(p=float("inf"))

# Each sample in data1 will be normalized using $L^2$ norm.
data1 = labels.zip(normalizer1.transform(features))

# Each sample in data2 will be normalized using $L^\infty$ norm.
data2 = labels.zip(normalizer2.transform(features))

# take(2) fetches only the first two rows; collect()[:2] would ship the whole
# RDD to the driver just to display the same two rows.
data2.take(2)
[10]:
[(0.0,
SparseVector(692, {127: 0.2, 128: 0.6235, 129: 0.9922, 130: 0.6235, 131: 0.1961, 154: 0.1882, 155: 0.9333, 156: 0.9882, 157: 0.9882, 158: 0.9882, 159: 0.9294, 181: 0.2118, 182: 0.8902, 183: 0.9922, 184: 0.9882, 185: 0.9373, 186: 0.9137, 187: 0.9882, 188: 0.2235, 189: 0.0235, 207: 0.0392, 208: 0.2353, 209: 0.8784, 210: 0.9882, 211: 0.9922, 212: 0.9882, 213: 0.7922, 214: 0.3294, 215: 0.9882, 216: 0.9922, 217: 0.4784, 235: 0.6392, 236: 0.9882, 237: 0.9882, 238: 0.9882, 239: 0.9922, 240: 0.9882, 241: 0.9882, 242: 0.3765, 243: 0.7412, 244: 0.9922, 245: 0.6549, 262: 0.2, 263: 0.9333, 264: 0.9922, 265: 0.9922, 266: 0.7451, 267: 0.4471, 268: 0.9922, 269: 0.8941, 270: 0.1843, 271: 0.3098, 272: 1.0, 273: 0.6588, 289: 0.1882, 290: 0.9333, 291: 0.9882, 292: 0.9882, 293: 0.702, 294: 0.0471, 295: 0.2941, 296: 0.4745, 297: 0.0824, 300: 0.9922, 301: 0.9529, 302: 0.1961, 316: 0.149, 317: 0.6471, 318: 0.9922, 319: 0.9137, 320: 0.8157, 321: 0.3294, 328: 0.9922, 329: 0.9882, 330: 0.6471, 343: 0.0275, 344: 0.698, 345: 0.9882, 346: 0.9412, 347: 0.2784, 348: 0.0745, 349: 0.1098, 356: 0.9922, 357: 0.9882, 358: 0.7647, 371: 0.2235, 372: 0.9882, 373: 0.9882, 374: 0.2471, 384: 0.9922, 385: 0.9882, 386: 0.7647, 399: 0.7765, 400: 0.9922, 401: 0.7451, 412: 1.0, 413: 0.9922, 414: 0.7686, 426: 0.298, 427: 0.9647, 428: 0.9882, 429: 0.4392, 440: 0.9922, 441: 0.9882, 442: 0.5804, 454: 0.3333, 455: 0.9882, 456: 0.902, 457: 0.098, 466: 0.0275, 467: 0.5294, 468: 0.9922, 469: 0.7294, 470: 0.0471, 482: 0.3333, 483: 0.9882, 484: 0.8745, 493: 0.0275, 494: 0.5137, 495: 0.9882, 496: 0.8824, 497: 0.2784, 510: 0.3333, 511: 0.9882, 512: 0.5686, 520: 0.1882, 521: 0.6471, 522: 0.9882, 523: 0.6784, 538: 0.3373, 539: 0.9922, 540: 0.8824, 547: 0.4471, 548: 0.9333, 549: 0.9922, 550: 0.6353, 566: 0.3333, 567: 0.9882, 568: 0.9765, 569: 0.5725, 570: 0.1882, 571: 0.1137, 572: 0.3333, 573: 0.698, 574: 0.8824, 575: 0.9922, 576: 0.8745, 577: 0.6549, 578: 0.2196, 594: 0.3333, 595: 0.9882, 596: 0.9882, 597: 0.9882, 598: 
0.898, 599: 0.8431, 600: 0.9882, 601: 0.9882, 602: 0.9882, 603: 0.7686, 604: 0.5098, 622: 0.1098, 623: 0.7804, 624: 0.9882, 625: 0.9882, 626: 0.9922, 627: 0.9882, 628: 0.9882, 629: 0.9137, 630: 0.5686, 651: 0.098, 652: 0.502, 653: 0.9882, 654: 0.9922, 655: 0.9882, 656: 0.5529, 657: 0.1451})),
(1.0,
SparseVector(692, {158: 0.4863, 159: 0.9922, 160: 1.0, 161: 0.2471, 185: 0.3765, 186: 0.9569, 187: 0.9843, 188: 0.9922, 189: 0.2431, 213: 0.498, 214: 0.9843, 215: 0.9843, 216: 0.9922, 217: 0.2431, 240: 0.2667, 241: 0.9255, 242: 0.9843, 243: 0.8275, 244: 0.1216, 245: 0.0314, 267: 0.2353, 268: 0.8941, 269: 0.9843, 270: 0.9843, 271: 0.3686, 295: 0.6078, 296: 0.9922, 297: 0.9922, 298: 0.7412, 322: 0.0784, 323: 0.9922, 324: 0.9843, 325: 0.9216, 326: 0.2588, 349: 0.1255, 350: 0.8039, 351: 0.9922, 352: 0.9843, 353: 0.4941, 377: 0.4078, 378: 0.9843, 379: 0.9922, 380: 0.7216, 381: 0.0588, 404: 0.3137, 405: 0.9412, 406: 0.9843, 407: 0.7569, 408: 0.0902, 431: 0.1255, 432: 0.9922, 433: 0.9922, 434: 0.9922, 435: 0.6235, 459: 0.5922, 460: 0.9843, 461: 0.9843, 462: 0.9843, 463: 0.1529, 486: 0.1882, 487: 0.8667, 488: 0.9843, 489: 0.9843, 490: 0.6745, 514: 0.9176, 515: 0.9843, 516: 0.9843, 517: 0.7686, 518: 0.0471, 542: 0.9922, 543: 0.9843, 544: 0.9843, 545: 0.349, 569: 0.6235, 570: 1.0, 571: 0.9922, 572: 0.9922, 573: 0.1216, 596: 0.1882, 597: 0.8941, 598: 0.9922, 599: 0.9686, 600: 0.549, 601: 0.0314, 624: 0.251, 625: 0.9843, 626: 0.9922, 627: 0.8627, 652: 0.251, 653: 0.9843, 654: 0.9922, 655: 0.8627, 680: 0.0941, 681: 0.7569, 682: 0.9922, 683: 0.8627}))]