You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
<figcaption style="text-align: center;">Table 3: Refactoring results for LIBRARIAN (w/ K = 8) averaged over 10 Code Contests collections</figcaption>
354
354
</figure>
355
355
356
+
<figure class="table-figure">
357
+
<table class="table-styled">
358
+
<thead>
359
+
<tr>
360
+
<th><strong>Collection</strong></th>
361
+
<th><strong>Agent</strong></th>
362
+
<th><strong>LLoC</strong></th>
363
+
<th><strong>CC</strong></th>
364
+
<th><strong>MDL ratio</strong></th>
365
+
<th><strong>Pass rate</strong></th>
366
+
</tr>
367
+
</thead>
368
+
<tbody>
369
+
<tr>
370
+
<td rowspan="3">datapipe</td>
371
+
<td>original</td>
372
+
<td>474</td>
373
+
<td>116</td>
374
+
<td>1.0</td>
375
+
<td>1.0</td>
376
+
</tr>
377
+
<tr>
378
+
<td>Cl-Cl</td>
379
+
<td>1084</td>
380
+
<td>341</td>
381
+
<td>9.1</td>
382
+
<td>1.0</td>
383
+
</tr>
384
+
<tr>
385
+
<td>Cl-Cx</td>
386
+
<td>645</td>
387
+
<td>174</td>
388
+
<td>2.2</td>
389
+
<td>1.0</td>
390
+
</tr>
391
+
<tr>
392
+
<td rowspan="3">state_machine</td>
393
+
<td>original</td>
394
+
<td>619</td>
395
+
<td>222</td>
396
+
<td>1.0</td>
397
+
<td>1.0</td>
398
+
</tr>
399
+
<tr>
400
+
<td>Cl-Cl</td>
401
+
<td>2271</td>
402
+
<td>735</td>
403
+
<td>6.3</td>
404
+
<td>0.9</td>
405
+
</tr>
406
+
<tr>
407
+
<td>Cl-Cx</td>
408
+
<td>686</td>
409
+
<td>227</td>
410
+
<td>1.9</td>
411
+
<td>1.0</td>
412
+
</tr>
413
+
<tr>
414
+
<td rowspan="3">config_schema</td>
415
+
<td>original</td>
416
+
<td>949</td>
417
+
<td>284</td>
418
+
<td>1.0</td>
419
+
<td>1.0</td>
420
+
</tr>
421
+
<tr>
422
+
<td>Cl-Cl</td>
423
+
<td>922</td>
424
+
<td>339</td>
425
+
<td>2.5</td>
426
+
<td>failed</td>
427
+
</tr>
428
+
<tr>
429
+
<td>Cl-Cx</td>
430
+
<td>626</td>
431
+
<td>207</td>
432
+
<td>2.1</td>
433
+
<td>1.0</td>
434
+
</tr>
435
+
<tr>
436
+
<td rowspan="3">cli_tools</td>
437
+
<td>original</td>
438
+
<td>875</td>
439
+
<td>300</td>
440
+
<td>1.0</td>
441
+
<td>1.0</td>
442
+
</tr>
443
+
<tr>
444
+
<td>Cl-Cl</td>
445
+
<td>2925</td>
446
+
<td>909</td>
447
+
<td>4.0</td>
448
+
<td>1.0</td>
449
+
</tr>
450
+
<tr>
451
+
<td>Cl-Cx</td>
452
+
<td>5848</td>
453
+
<td>2378</td>
454
+
<td>3.1</td>
455
+
<td>failed</td>
456
+
</tr>
457
+
<tr>
458
+
<td rowspan="3">cli_form</td>
459
+
<td>original</td>
460
+
<td>352</td>
461
+
<td>174</td>
462
+
<td>1.0</td>
463
+
<td>1.0</td>
464
+
</tr>
465
+
<tr>
466
+
<td>Cl-Cl</td>
467
+
<td>855</td>
468
+
<td>342</td>
469
+
<td>3.8</td>
470
+
<td>1.0</td>
471
+
</tr>
472
+
<tr>
473
+
<td>Cl-Cx</td>
474
+
<td>1366</td>
475
+
<td>548</td>
476
+
<td>2.9</td>
477
+
<td>1.0</td>
478
+
</tr>
479
+
</tbody>
480
+
</table>
481
+
<figcaption style="text-align: center;">Table 4: Full results on MiniCode-repositories small, using Codex with o4-mini and Claude Code with Claude Sonnet 3.7</figcaption>
482
+
</figure>
483
+
484
+
<figure class="table-figure">
485
+
<table class="table-styled">
486
+
<thead>
487
+
<tr>
488
+
<th><strong>Collection</strong></th>
489
+
<th><strong>Agent</strong></th>
490
+
<th><strong>LLoC</strong></th>
491
+
<th><strong>CC</strong></th>
492
+
<th><strong>MDL %</strong></th>
493
+
<th><strong>Pass %</strong></th>
494
+
</tr>
495
+
</thead>
496
+
<tbody>
497
+
<tr>
498
+
<td rowspan="3">command_line_task_manager</td>
499
+
<td>original</td>
500
+
<td>7964</td>
501
+
<td>2921</td>
502
+
<td>100</td>
503
+
<td>100</td>
504
+
</tr>
505
+
<tr>
506
+
<td>sonnet 3.7</td>
507
+
<td>10387</td>
508
+
<td>3899</td>
509
+
<td>130</td>
510
+
<td>100</td>
511
+
</tr>
512
+
<tr>
513
+
<td>sonnet 4</td>
514
+
<td>9515</td>
515
+
<td>3946</td>
516
+
<td>125</td>
517
+
<td>100</td>
518
+
</tr>
519
+
<tr>
520
+
<td rowspan="3">concurrent_task_scheduler</td>
521
+
<td>original</td>
522
+
<td>10350</td>
523
+
<td>3935</td>
524
+
<td>100</td>
525
+
<td>100</td>
526
+
</tr>
527
+
<tr>
528
+
<td>sonnet 3.7</td>
529
+
<td>18653</td>
530
+
<td>7165</td>
531
+
<td>190</td>
532
+
<td>80</td>
533
+
</tr>
534
+
<tr>
535
+
<td>sonnet 4</td>
536
+
<td>11491</td>
537
+
<td>4762</td>
538
+
<td>122</td>
539
+
<td>80</td>
540
+
</tr>
541
+
<tr>
542
+
<td rowspan="3">file_system_analyzer</td>
543
+
<td>original</td>
544
+
<td>3911</td>
545
+
<td>1338</td>
546
+
<td>100</td>
547
+
<td>100</td>
548
+
</tr>
549
+
<tr>
550
+
<td>sonnet 3.7</td>
551
+
<td>6495</td>
552
+
<td>2218</td>
553
+
<td>160</td>
554
+
<td>100</td>
555
+
</tr>
556
+
<tr>
557
+
<td>sonnet 4</td>
558
+
<td>5221</td>
559
+
<td>1793</td>
560
+
<td>130</td>
561
+
<td>100</td>
562
+
</tr>
563
+
<tr>
564
+
<td rowspan="3">in_memory_database</td>
565
+
<td>original</td>
566
+
<td>4565</td>
567
+
<td>1671</td>
568
+
<td>100</td>
569
+
<td>100</td>
570
+
</tr>
571
+
<tr>
572
+
<td>sonnet 3.7</td>
573
+
<td>6667</td>
574
+
<td>2556</td>
575
+
<td>150</td>
576
+
<td>100</td>
577
+
</tr>
578
+
<tr>
579
+
<td>sonnet 4</td>
580
+
<td>5617</td>
581
+
<td>2356</td>
582
+
<td>131</td>
583
+
<td>100</td>
584
+
</tr>
585
+
<tr>
586
+
<td rowspan="3">incremental_backup_system</td>
587
+
<td>original</td>
588
+
<td>4587</td>
589
+
<td>1482</td>
590
+
<td>100</td>
591
+
<td>100</td>
592
+
</tr>
593
+
<tr>
594
+
<td>sonnet 3.7</td>
595
+
<td>5365</td>
596
+
<td>1788</td>
597
+
<td>130</td>
598
+
<td>failed</td>
599
+
</tr>
600
+
<tr>
601
+
<td>sonnet 4</td>
602
+
<td>6620</td>
603
+
<td>2168</td>
604
+
<td>152</td>
605
+
<td>60</td>
606
+
</tr>
607
+
<tr>
608
+
<td rowspan="3">personal_finance_tracker</td>
609
+
<td>original</td>
610
+
<td>4306</td>
611
+
<td>1482</td>
612
+
<td>100</td>
613
+
<td>100</td>
614
+
</tr>
615
+
<tr>
616
+
<td>sonnet 3.7</td>
617
+
<td>7443</td>
618
+
<td>2512</td>
619
+
<td>180</td>
620
+
<td>63</td>
621
+
</tr>
622
+
<tr>
623
+
<td>sonnet 4</td>
624
+
<td>9212</td>
625
+
<td>3335</td>
626
+
<td>215</td>
627
+
<td>63</td>
628
+
</tr>
629
+
<tr>
630
+
<td rowspan="3">personal_knowledge_management</td>
631
+
<td>original</td>
632
+
<td>5341</td>
633
+
<td>1832</td>
634
+
<td>100</td>
635
+
<td>100</td>
636
+
</tr>
637
+
<tr>
638
+
<td>sonnet 3.7</td>
639
+
<td>6357</td>
640
+
<td>2364</td>
641
+
<td>140</td>
642
+
<td>100</td>
643
+
</tr>
644
+
<tr>
645
+
<td>sonnet 4</td>
646
+
<td>5575</td>
647
+
<td>2597</td>
648
+
<td>131</td>
649
+
<td>80</td>
650
+
</tr>
651
+
<tr>
652
+
<td rowspan="3">query_language_interpreter</td>
653
+
<td>original</td>
654
+
<td>5181</td>
655
+
<td>2105</td>
656
+
<td>100</td>
657
+
<td>100</td>
658
+
</tr>
659
+
<tr>
660
+
<td>sonnet 3.7</td>
661
+
<td>7193</td>
662
+
<td>2966</td>
663
+
<td>140</td>
664
+
<td>100</td>
665
+
</tr>
666
+
<tr>
667
+
<td>sonnet 4</td>
668
+
<td>7490</td>
669
+
<td>3076</td>
670
+
<td>142</td>
671
+
<td>100</td>
672
+
</tr>
673
+
<tr>
674
+
<td rowspan="3">text_editor</td>
675
+
<td>original</td>
676
+
<td>4324</td>
677
+
<td>1323</td>
678
+
<td>100</td>
679
+
<td>100</td>
680
+
</tr>
681
+
<tr>
682
+
<td>sonnet 3.7</td>
683
+
<td>5792</td>
684
+
<td>1822</td>
685
+
<td>140</td>
686
+
<td>100</td>
687
+
</tr>
688
+
<tr>
689
+
<td>sonnet 4</td>
690
+
<td>6017</td>
691
+
<td>2308</td>
692
+
<td>148</td>
693
+
<td>100</td>
694
+
</tr>
695
+
<tr>
696
+
<td rowspan="3">virtual_machine_emulator</td>
697
+
<td>original</td>
698
+
<td>6841</td>
699
+
<td>2283</td>
700
+
<td>100</td>
701
+
<td>100</td>
702
+
</tr>
703
+
<tr>
704
+
<td>sonnet 3.7</td>
705
+
<td>10108</td>
706
+
<td>3580</td>
707
+
<td>160</td>
708
+
<td>100</td>
709
+
</tr>
710
+
<tr>
711
+
<td>sonnet 4</td>
712
+
<td>9220</td>
713
+
<td>3303</td>
714
+
<td>137</td>
715
+
<td>100</td>
716
+
</tr>
717
+
</tbody>
718
+
</table>
719
+
<figcaption style="text-align: center;">Table 5: Full results on MiniCode-repositories large, comparing the original code sources with Claude Sonnet 3.7 and Sonnet 4</figcaption>
720
+
</figure>
721
+
356
722
Check out the paper for the full CodeContests results, as well as repository results!
0 commit comments