logo sykohpath.com

				where code goes to die
			
	
tagstagstags

Sitemap XML Generator

Forgot I made this a while back...Ok, you know how there's "free" sitemap.xml generators?  And you know how they limit it to like, 10 links?  And you know how they suck?

Time to fix that.

Instructions:

1) Open "sitexmlgen.php"
2) Change the site to scan (line 22 of the listing below):  $linklist[0] = "http://www.sykohpath.com/";
3) Place on your site
4) Run the script.  Example: www.yoursite.com/sitexmlgen.php
5) Wait until it finishes; this will take a long time on large sites with many links.
6) When finished, view source
7) Copy ALL the text under the "XML IS BELOW" section.
8) Paste into new document "sitemap.xml".
9) Done with generation!  If you don't know what to do with that file, learn to SEO.
10) there is no 10.


And here's the code.  Note that there *is* a 50,000 link limit, imposed by sitemaps.org:

Code Sample:
  1. <?php /* ---- Information ----
  2. Name: sitexmlgen.php  "sitemap.xml Generator"
  3. Last Updated:  20110512 080000
  4. Page Version: sitexmlgen.php v1.0
  5. Author: SyKoHPaTh
  6. ---- Version History ----
  7.   1.0  Initial Coding
  8. -------------------------
  9.   PURPOSE:
  10.    "crawl" a site and record the links if they are valid.
  11.    Note: this only picks up links between <a> tags.
  12. -------------------------
  13.   TODO:
  14. -------------------------
  15.   LICENSE:
  16.    Modification: OK, but must keep credit line: "SyKoHPaTh (www.sykohpath.com)", and this License.  Any modifications MUST be written in "Version History", with your name and/or handle, and what the modification was.
  17.    Free for public and commercial use.  If you paid for this, you got scammed.
  18. --------------------------
  19. */
  /* -------- VARIABLES -------- */
  $linklist = array();  // crawl queue: index 0 is the site root, discovered links are appended behind it
  $linklist[0] = "http://www.sykohpath.com/";  // site to scan -- change this to your own site (keep the trailing slash)
  $sitemap_limit = 50000;  //enforced by sitemaps.org, max number of <url> links in one sitemap XML file.
          // there is also a file-size limit on sitemap XML files (see sitemaps.org), but we're not checking for that here.
  25. /* -------- FUNCTIONS -------- */
  /**
   * Fetches one page and returns the href targets of its <a> tags.
   *
   * Only double-quoted href attributes are recognised, and only on anchors
   * whose content holds no further markup once strip_tags() has removed every
   * tag except <a>. Root-relative ("/x"), protocol-relative ("//host/x") and
   * query-only ("?x=1") targets are made absolute against $scanlink; every
   * other target is returned exactly as written in the page.
   *
   * @param string $scanlink URL (or local path) of the page to scan.
   * @return array Link targets in document order; empty when the page cannot
   *               be read or contains no matching anchors.
   */
  function digger($scanlink) {
    // Best-effort crawl: "@" keeps one unreachable page from emitting a PHP
    // warning; the failure is reported explicitly right below instead.
    $linkcontents = @file_get_contents($scanlink);
    if ($linkcontents === false) {
      print "Unable to open: {$scanlink}\n";
      return array();
    }

    // scheme://host[:port] of the scanned page, used to absolutise "/x" links.
    // Left null when $scanlink has no scheme/host (e.g. a local file path).
    $scheme = null;
    $linkcore = null;
    $linkinfo = parse_url($scanlink);
    if (is_array($linkinfo) && isset($linkinfo['scheme'], $linkinfo['host'])) {
      $scheme = $linkinfo['scheme'];
      $linkcore = $scheme . "://" . $linkinfo['host'];
      if (isset($linkinfo['port'])) {
        $linkcore .= ":" . $linkinfo['port'];
      }
    }

    // The scanned URL without its own query string / fragment: a query-only
    // link ("?page=2") replaces the query of the current page.
    $pagebase = preg_replace('/[?#].*$/s', '', $scanlink);

    $linkcontents_strip = strip_tags($linkcontents, "<a>");
    preg_match_all('/<a(?:[^>]*)href="([^"]*)"(?:[^>]*)>(?:[^<]*)<\/a>/is', $linkcontents_strip, $matches);

    $links = array();
    foreach ($matches[1] as $href) {
      if (strncmp($href, "//", 2) === 0) {
        // protocol-relative: inherits only the scheme of the scanned page
        if ($scheme !== null) {
          $href = $scheme . ":" . $href;
        }
      } elseif (strncmp($href, "/", 1) === 0) {
        // root-relative: resolved against scheme://host[:port]
        if ($linkcore !== null) {
          $href = $linkcore . $href;
        }
      } elseif (strncmp($href, "?", 1) === 0) {
        // query-only: same page, different query string
        $href = $pagebase . $href;
      }
      $links[] = $href;
    }
    return $links;
  }
  /**
   * Reports whether a link can be loaded.
   *
   * The target is fetched in full with file_get_contents(); a failed fetch
   * prints an "Unable to open" line and yields false. A page that loads but
   * has an empty body still counts as reachable.
   *
   * @param string $linkcheck URL (or local path) to probe.
   * @return bool True when the target could be read, false otherwise.
   */
  function checklink($linkcheck) {
    // "@" suppresses the PHP warning of a failed fetch; the failure is
    // reported through the message below and the return value instead.
    $linkcontents = @file_get_contents($linkcheck);
    if ($linkcontents === false) {
      print "Unable to open: {$linkcheck}\n";
      return false;
    }
    return true;
  }
  /* -------- Initial header thing -------- */

  /**
   * Formats one <url> entry of the sitemap.
   *
   * @param string $url Absolute URL, not yet escaped.
   * @return string The <url> element with the URL entity-escaped for XML.
   */
  function sitemap_url_entry($url) {
    //Optional tags for each link, to be placed after <loc>:
    //<lastmod>" . date("Y-m-d") . "</lastmod>\n
    //<changefreq>yearly</changefreq>\n
    //<priority>0.5</priority>\n
    return "\t<url>\n\t\t<loc>" . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . "</loc>\n\t</url>\n";
  }

  $xmloutput = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n<!-- created with SyKoHPaTh's SiteMap.XML generator  www.sykohpath.com  -->\n";

  $siteroot = $linklist[0];
  $rootlen = strlen($siteroot);
  $seen = array($siteroot => true);  // URLs already in $linklist, as keys for O(1) lookup
  $rejected = array();               // relative links that already failed checklink()

  // The start page belongs in the sitemap too.
  $xmloutput .= sitemap_url_entry($siteroot);

  // $linklist doubles as the crawl queue: it grows while it is being walked.
  // The walk ends when no unscanned link is left or when the sitemap is full.
  for ($x = 0; $x < count($linklist) && count($linklist) < $sitemap_limit; $x++) {
    //gen list
    print "Scanning [$x of " . (count($linklist)-1) . "]: " . $linklist[$x] . "<br>\n";
    $linkmatch = digger($linklist[$x]);

    // Directory of the page being scanned (query/fragment dropped, then
    // everything after the last slash cut off); relative links resolve against it.
    $pattern = preg_replace("/[^\/]*$/s", "", preg_replace("/[?#].*$/s", "", $linklist[$x]));

    //scan list
    foreach ($linkmatch as $value) {
      if (count($linklist) >= $sitemap_limit) {
        break;  // one sitemap file may not hold more <url> entries
      }

      // href values come straight out of the HTML source: undo entity
      // encoding (&amp; etc.), strip whitespace, and drop any #fragment.
      $value = trim(html_entity_decode($value, ENT_QUOTES, 'UTF-8'));
      $value = preg_replace("/#.*$/s", "", $value);
      if ($value === "") {
        continue;
      }

      $needscheck = false;
      if (strncmp($value, $siteroot, $rootlen) !== 0) {
        // Anything carrying its own scheme or host is a foreign link
        // (other sites, mailto:, javascript:, ...) and is ignored.
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $value) || strncmp($value, "//", 2) === 0) {
          continue;
        }
        // Relative link: strip beginning / if there is one, then put the
        // directory of the scanned page in front.
        $value = $pattern . preg_replace("/^[\/]/s", "", $value);
        if (strncmp($value, $siteroot, $rootlen) !== 0) {
          continue;
        }
        $needscheck = true;
      }

      if (isset($seen[$value]) || isset($rejected[$value])) {
        continue;
      }
      // A guessed relative URL is only accepted if it actually loads.
      if ($needscheck && !checklink($value)) {
        $rejected[$value] = true;
        continue;
      }

      //push to array
      $seen[$value] = true;
      $linklist[] = $value;
      $xmloutput .= sitemap_url_entry($value);
    }
    //echo "Total links: " . count($linklist) . "<br>\n";
  }

  $xmloutput .= "</urlset>";
  print "<br>\n<br>\n-----------------------------------------------------------<br>\n           XML IS BELOW (view source)<br>\n           Copy and paste into \"sitemap.xml\"<br>\n-----------------------------------------------------------<br>\n\n" . $xmloutput;
  ?>

php, xml, sitemap,


0 comments.