MobileRead Forums - View Single Post - FLAG (Fanfiction.net Lightweight Automated Grabber)

ilovejedd · 04-18-2009, 01:57 AM

Grr... FanFiction.Net enjoys making my life miserable. They changed the page layout again. Hopefully, it'll stay the same for some time.

Here's the updated source. Caveat: I didn't have the inclination to look for one of the cleaned-up versions, so this one is probably full of stray code that I use for testing. Oh well, it works for me, though. For anyone using this with Calibre recipes, I've also updated the "print version" generator on utterlyinsane.org.

PHP Code:


			
<?php



/**
 * Describes this story source in one line.
 *
 * @return string Tab-prefixed, human-readable description of the source.
 */
function ffnet_source_info() {
    $description = "\tFetches stories from fanfiction.net";

    return $description;
}



/**
 * Downloads a story from fanfiction.net and returns its metadata and pages.
 *
 * The first chapter page is always fetched and handed to ffnet_get_meta().
 * Unless $meta is true, every remaining chapter listed in the metadata is
 * fetched as well, and each page is reduced to the markup found between the
 * "<!-- start story -->" and "<!-- end story -->" markers.
 *
 * @param mixed $storyid Story id, interpolated as-is into "/s/<id>/<chapter>".
 * @param bool  $meta    When true, only the first page is fetched and pages
 *                       are left as raw HTML (metadata-only mode).
 *
 * @return array Array with keys:
 *               - 'source': host name the story came from,
 *               - 'pages':  chapter number => page content; in full mode a
 *                           page is null when it could not be fetched or
 *                           contained no story markers,
 *               - 'meta':   result of ffnet_get_meta().
 */
function ffnet_get_story($storyid, $meta = false) {
    $ffurl = 'www.fanfiction.net';
    $story = array();
    $story['source'] = $ffurl;
    $ffurl = 'http://' . $ffurl;

    //get initial info
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, 'SIRG/0.1 (Stanza iPhone RSS Generator)');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_URL, "$ffurl/s/$storyid/1");
    $story['pages'][1] = curl_exec($ch);

    //extract metadata
    $story['meta'] = ffnet_get_meta($story['pages'][1], $meta);

    //metadata-only mode: nothing else to download or clean
    if ($meta) {
        return $story;
    }

    //fetch remaining pages
    if (isset($story['meta']['chapters'])) {
        foreach ($story['meta']['chapters'] as $key => $chaptitle) {
            if (!isset($story['pages'][$key])) {
                curl_setopt($ch, CURLOPT_URL, "$ffurl/s/$storyid/$key");
                $story['pages'][$key] = curl_exec($ch);
            }
        }
    }
    else {
        //single-chapter stories have no chapter selector; use the title
        $story['meta']['chapters'][1] = $story['meta']['title'];
    }

    //clean pages
    //Assign by key instead of iterating by reference, so no reference is
    //left dangling after the loop. A page that failed to download
    //(curl_exec() returned false) or lacks the story markers becomes null
    //rather than triggering an undefined-offset warning.
    foreach ($story['pages'] as $key => $rawpage) {
        $found = array();
        if (preg_match("/<!-- start story -->(.+)<!-- end story -->/Usi", (string) $rawpage, $found)) {
            $story['pages'][$key] = $found[1];
        }
        else {
            $story['pages'][$key] = null;
        }
    }

    return $story;
}



/**
 * Returns capture group $index of a preg_match() result, or null when the
 * group is absent (the pattern failed, or a trailing group did not take part).
 *
 * @param array $matches Match array filled in by preg_match().
 * @param int   $index   Capture group number.
 *
 * @return string|null
 */
function ffnet_match_group($matches, $index) {
    return isset($matches[$index]) ? $matches[$index] : null;
}

/**
 * Extracts story metadata from the HTML of a fanfiction.net chapter page.
 *
 * Every field is scraped with a regular expression; a field whose pattern
 * does not match is returned as null (the summary as an empty string)
 * instead of raising an undefined-offset warning.
 *
 * @param string $page       Raw HTML of a story page.
 * @param bool   $nochapters When true, 'chapters' is set to the number of the
 *                           last entry in the chapter selector instead of a
 *                           chapter-number => title map.
 *
 * @return array Keys: title, author, category, crosscat, rating, language,
 *               genre, date_pub, date_update, complete, ficstatus, id,
 *               category2, genre2, char, summary, and - only when the page
 *               has a chapter selector - chapters.
 */
function ffnet_get_meta($page, $nochapters) {
    $page = (string) $page;
    $meta = array();
    $matches = array();
    $tag = null; //stays null for non-crossover stories

    //category and title
    if (preg_match('/.*<a href.+>.+Crossover<\/a> »/Ui', $page)) {
        preg_match('/.*<a href.+>(.+) Crossover<\/a> » <b>(.+)<\/b>/Ui', $page, $matches);
        if (isset($matches[1])) {
            $matches[1] = str_replace(',', '', $matches[1]);
            //"A and B" becomes the comma-separated tag list "A,B"
            $tag = str_ireplace(' and ', ',', $matches[1]);
        }
    }
    else {
        preg_match('/.+ » <a href=.+>(.+)<\/a> » <b>(.+)<\/b>/Ui', $page, $matches);
    }

    //author
    $author = array();
    preg_match("/<a href='\/u\/[0-9]+\/.+'>(.+)<\/a>/Ui", $page, $author);
    $meta['title'] = ffnet_match_group($matches, 2);
    $meta['author'] = ffnet_match_group($author, 1);
    $meta['category'] = ffnet_match_group($matches, 1);
    $meta['crosscat'] = $tag;

    //rating, lang, category
    //The /U modifier makes every quantifier lazy, so a bare trailing
    //([0-9]+) would capture only the first digit of the id; the negative
    //lookahead forces it to consume the whole number.
    preg_match("/Rated: <a href='http:\/\/www.fictionratings.com\/guide.php' target='rating'> (.+)<\/a> - (.+) - (.+) - Reviews: <a href='.+'>[0-9]+<\/a>( - Updated: ([0-9-]+))? - Published: ([0-9-]+) (- (Complete)+ )?- id:([0-9]+)(?![0-9])/Ui", $page, $matches);
    //the pattern is case-insensitive, so compare the captured word likewise
    $iscomplete = strcasecmp((string) ffnet_match_group($matches, 8), 'Complete') === 0;
    $meta['rating'] = ffnet_match_group($matches, 1);
    $meta['language'] = ffnet_match_group($matches, 2);
    $meta['genre'] = ffnet_match_group($matches, 3);
    $meta['date_pub'] = ffnet_match_group($matches, 6);
    $meta['date_update'] = ffnet_match_group($matches, 5);
    $meta['complete'] = $iscomplete;
    $meta['ficstatus'] = $iscomplete ? 'Complete' : 'In-Progress';
    $meta['id'] = ffnet_match_group($matches, 9);

    //description
    preg_match('/<meta name="description" content="(.+), (.+)(, pairing: (.+))?,  (.+)">/Ui', $page, $matches);
    $meta['category2'] = ffnet_match_group($matches, 1);
    $meta['genre2'] = ffnet_match_group($matches, 2);
    $meta['char'] = ffnet_match_group($matches, 4);
    $summary = ffnet_match_group($matches, 5);
    if ($summary === null) {
        $summary = '';
    }
    else {
        //iconv() returns false when the conversion is unsupported or fails;
        //keep the unconverted text in that case rather than losing it
        $converted = iconv('UTF-8', 'ISO-8859-15//TRANSLIT//IGNORE', $summary);
        if ($converted !== false) {
            $summary = $converted;
        }
    }
    $meta['summary'] = $summary;

    //chapters
    if (preg_match("/<SELECT title='chapter navigation' Name=chapter onChange=\".+\">(<option.+)<\/select>/Ui", $page, $matches)) {
        $options = array();
        preg_match_all("/<option  value=[0-9]+ (selected)?>([0-9]+). (.+)(?=<option|$)/Ui", $matches[1], $options);
        $numbers = $options[2];
        $titles = $options[3];

        //null (not an empty array) when no option parsed, so that a caller's
        //isset($meta['chapters']) check treats it as "no chapter list"
        $chapters = null;
        if (count($numbers) > 0) {
            if ($nochapters) {
                //metadata-only: keep just the last chapter number
                $chapters = end($numbers);
            }
            else {
                //chapter number => chapter title
                $chapters = array_combine($numbers, $titles);
            }
        }
        $meta['chapters'] = $chapters;
    }

    return $meta;
}



?>

04-18-2009, 01:57 AM	#97
ilovejedd hopeless n00b Posts: 5,136 Karma: 19597086 Join Date: Jan 2009 Location: in the middle of nowhere Device: PW4, PW3, Libra H2O, iPad 10.5, iPad 11, iPad 12.9	Grr... FanFiction.Net enjoys making my life miserable. They changed the page layout again. Hopefully, it'll stay the same for some time. Here's the updated source. Caveat, didn't have the inclination to look for one of the cleaned up ones. This one is probably full of stray code that I use for testing. Oh well, works for me, though. For anyone using this with Calibre recipes, I've also updated the "print version" generator on utterlyinsane.org PHP Code: <?php function ffnet_source_info() { return "\tFetches stories from fanfiction.net"; } function ffnet_get_story($storyid, $meta = false) { $ffurl = 'www.fanfiction.net'; $story['source'] = $ffurl; $ffurl = 'http://' . $ffurl; //get initial info $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, 'SIRG/0.1 (Stanza iPhone RSS Generator)'); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); curl_setopt($ch, CURLOPT_URL, "$ffurl/s/$storyid/1"); $story['pages'][1] = curl_exec($ch); //extract metadata $story['meta'] = ffnet_get_meta($story['pages'][1], $meta); //fetch remaining pages if(!$meta) { if(isset($story['meta']['chapters'])) { foreach($story['meta']['chapters'] as $key => $chaptitle) { if(!isset($story['pages'][$key])) { curl_setopt($ch, CURLOPT_URL, "$ffurl/s/$storyid/$key"); $story['pages'][$key] = curl_exec($ch); } } } else $story['meta']['chapters'][1] = $story['meta']['title']; //clean pages foreach($story['pages'] as &$currpage) { preg_match("/<!-- start story -->(.+)<!-- end story -->/Usi", $currpage, $matches); $currpage = $matches[1]; } } return $story; } function ffnet_get_meta($page, $nochapters) { $matches = array(); //category and title if (preg_match('/.<a href.+>.+Crossover<\/a> »/Ui', $page, $matches)) { preg_match('/.<a href.+>(.+) Crossover<\/a> » <b>(.+)<\/b>/Ui', $page, $matches); 
$matches[1] = str_ireplace(',', '', $matches[1]); $tag = str_ireplace(' and ', ',', $matches[1]); //$tag = preg_replace('/<.+>/Ui', '', $tag); //$matches[1] = preg_replace('/<.+>/Ui', '', $matches[1]); } else preg_match('/.+ » <a href=.+>(.+)<\/a> » <b>(.+)<\/b>/Ui', $page, $matches); //author preg_match("/<a href='\/u\/[0-9]+\/.+'>(.+)<\/a>/Ui", $page, $author); $meta['title'] = $matches[2]; $meta['author'] = $author[1]; $meta['category'] = $matches[1]; $meta['crosscat'] = $tag; //rating, lang, category preg_match("/Rated: <a href='http:\/\/www.fictionratings.com\/guide.php' target='rating'> (.+)<\/a> - (.+) - (.+) - Reviews: <a href='.+'>[0-9]+<\/a>( - Updated: ([0-9-]+))? - Published: ([0-9-]+) (- (Complete)+ )?- id:([0-9]+)/Ui", $page, $matches); $meta['rating'] = $matches[1]; $meta['language'] = $matches[2]; $meta['genre'] = $matches[3]; $meta['date_pub'] = $matches[6]; $meta['date_update'] = $matches[5]; $meta['complete'] = $matches[8] == 'Complete' ? true : false; $meta['ficstatus'] = $matches[8] == 'Complete' ? 'Complete' : 'In-Progress'; $meta['id'] = $matches[9]; //description preg_match('/<meta name="description" content="(.+), (.+)(, pairing: (.+))?, (.+)">/Ui', $page, $matches); $meta['category2'] = $matches[1]; $meta['genre2'] = $matches[2]; $meta['char'] = $matches[4]; $meta['summary'] = $matches[5]; $meta['summary'] = iconv('UTF-8', 'ISO-8859-15//TRANSLIT//IGNORE', $meta['summary']); //chapters if($nochapters){ if(preg_match("/<SELECT title='chapter navigation' Name=chapter onChange=\".+\">(<option.+)<\/select>/Ui", $page, $matches)) { preg_match_all("/<option value=[0-9]+ (selected)?>([0-9]+). (.+)(?=<option\|$)/Ui", $matches[1], $matches); foreach($matches[2] as $key => $chapnum) $chaparr = $chapnum; $meta['chapters'] = $chaparr; } } else { if(preg_match("/<SELECT title='chapter navigation' Name=chapter onChange=\".+\">(<option.+)<\/select>/Ui", $page, $matches)) { preg_match_all("/<option value=[0-9]+ (selected)?>([0-9]+). 
(.+)(?=<option\|$)/Ui", $matches[1], $matches); foreach($matches[2] as $key => $chapnum) $chaparr[$chapnum] = $matches[3][$key]; $meta['chapters'] = $chaparr; } } return $meta; } ?>