adamzimmermann · August 15, 2016 19:31
diff --git a/ExampleMigrateSourceImage.php b/ExampleMigrateSourceImage.php
 <?php

 /**
  * Source Migration class for finding image tags in markup.
  *
  * Queries a source table for HTML, scans it for <img> tags, and exposes one
  * source row per unique image URL. The work is done lazily the first time a
  * count or a row is requested and is cached for the lifetime of the object.
  */
 class ExampleMigrateSourceImage extends MigrateSource {

  /**
   * The string of HTML content.
   */
  private $content = '';

  /**
   * Per-record chunks of content, for parsers that cannot take one big string.
   *
   * Each entry is an array with the keys 'title', 'content' and 'id'.
   */
  private $contentArray = array();

  /**
   * The array of matching image tags found, keyed by the image identifier.
   */
  private $matches = array();

  /**
   * The zero-based position of the next match to hand out in getNextRow().
   */
  private $matchesCurrent = 0;

  /**
   * Indicates if the content has been queried and parsed.
   */
  private $contentImported = FALSE;

  /**
   * {@inheritdoc}
   */
  public function __construct($options = array()) {
    parent::__construct($options);
  }

  /**
   * {@inheritdoc}
   */
  public function __toString() {
    return t('Create and download images referenced from a string of markup.');
  }

  /**
   * Return the number of available source records.
   *
   * @return int
   *   The number of unique images found in the source content.
   */
  public function computeCount() {
    $this->importContent();
    // $this->matches is initialized to an array and only ever appended to, so
    // it can be counted directly.
    return count($this->matches);
  }

  /**
   * Returns a list of fields available to be mapped from the source.
   *
   * @return array
   *   Field descriptions keyed by source field name.
   */
  public function fields() {
    return array(
      'alt' => t('Alt text'),
      'title' => t('Title text'),
      'url' => t('URL'),
      'credit' => t('Credit'),
      'filename' => t('Filename'),
      'node_title' => t('Title'),
    );
  }

  /**
   * Do whatever needs to be done to start a fresh traversal of the source data.
   *
   * This is always called at the start of an import, so tasks such as opening
   * file handles, running queries, and so on should be performed here.
   */
  public function performRewind() {
    $this->matchesCurrent = 0;
  }

  /**
   * Fetch the next row of data, returning it as an object.
   *
   * @return object|false
   *   A stdClass carrying every value stored for the match, or FALSE when
   *   there is no more data available.
   */
  public function getNextRow() {
    $this->importContent();
    if ($this->matchesCurrent >= count($this->matches)) {
      return FALSE;
    }

    // $this->matches is keyed by image identifier, so the current position is
    // resolved with array_slice(). The slice is stored in a variable first
    // because reset() takes its argument by reference.
    $slice = array_slice($this->matches, $this->matchesCurrent, 1);
    $match = reset($slice);

    // Add all of the values collected in findImages().
    $row = new stdClass();
    foreach ($match as $key => $value) {
      $row->{$key} = $value;
    }

    // Increment the current match counter.
    $this->matchesCurrent++;
    return $row;
  }

  /**
   * Find and parse the source data if it hasn't already been done.
   */
  private function importContent() {
    if (!$this->contentImported) {
      // Build the content string to parse for images.
      $this->buildContent();
      // Find the images in the string and populate the matches array.
      $this->findImages();
      // Note that the import has been completed and does not need to be
      // performed again.
      $this->contentImported = TRUE;
    }
  }

  /**
   * Get all of the HTML that needs to be filtered for image tags and tokens.
   *
   * Fills both $this->content (one long string) and $this->contentArray (one
   * entry per queried record).
   */
  private function buildContent() {
    $content = $this->contentQuery()->execute()->fetchAll();
    foreach ($content as $item) {
      // This builds one long string, for parsing that can be done on long
      // strings without using too much memory. Here, we add fields 'foo' and
      // 'bar' from the query.
      $this->content .= $item->foo;
      $this->content .= $item->bar;

      // This builds an array of content for parsing operations that need to
      // be performed on smaller chunks of the source data to avoid memory
      // issues. It is only required if you run into parsing issues, otherwise
      // it can be removed. The values stay NULL unless contentQuery() selects
      // the matching columns.
      $this->contentArray[] = array(
        'title' => isset($item->post_title) ? $item->post_title : NULL,
        'content' => isset($item->post_content) ? $item->post_content : NULL,
        'id' => isset($item->id) ? $item->id : NULL,
      );
    }
  }

  /**
   * Creates the query that will be used to build the content string.
   *
   * @return object
   *   A SelectQuery object.
   */
  protected function contentQuery() {
    $query = Database::getConnection('default', 'example')->select('table', 't');
    $query->fields('t', array(
      'foo',
      'bar',
    ));
    $query->condition('t.foo', '', '!=');
    $query->condition('t.bar', '', '!=');
    return $query;
  }

  /**
   * Finds the desired elements in the markup.
   *
   * @return bool
   *   FALSE when there was no content to search, TRUE otherwise.
   */
  private function findImages() {
    // Verify that content was found.
    if (empty($this->content)) {
      $message = 'No HTML content with image tags to download could be found.';
      watchdog('example_migrate', $message);
      return FALSE;
    }

    // Find images where the entire source content string can be parsed at once.
    $this->findImageMethodOne();

    // Find images where the source content must be parsed in chunks.
    foreach ($this->contentArray as $post) {
      $this->findImageMethodTwo($post);
    }
    return TRUE;
  }

  /**
   * This is an example of an image finding method.
   *
   * Scans the whole content string for <img> tags and records one match per
   * unique src value. 'credit' and 'custom_thing' cannot be derived from an
   * <img> tag and are left NULL for project-specific logic to fill in.
   */
  private function findImageMethodOne() {
    // Tags whose attribute values contain a literal ">" are cut short by this
    // pattern; switch to a DOM parser if the source markup needs that.
    $tags = array();
    $regex = '/<img\b[^>]*>/i';
    if (!preg_match_all($regex, $this->content, $tags, PREG_SET_ORDER)) {
      return;
    }

    foreach ($tags as $tag) {
      $attributes = $this->parseImageAttributes($tag[0]);
      $src = isset($attributes['src']) ? trim($attributes['src']) : '';
      if ($src === '') {
        continue;
      }

      // Set a unique row identifier - here the image URL. You might need to
      // perform cleanup on this value to standardize it, as the path
      // to /foo/bar/image.jpg, example.com/foo/bar/image.jpg, and
      // http://example.com/foo/bar/image.jpg should not create three unique
      // source records. Standardizing the URL is key for not just avoiding
      // creating duplicate source records, but the URL is also the ID value
      // you will use in your destination class mapping callback that looks up
      // the resulting image entity ID from the data it finds in the body field.
      $id = $src;
      if (isset($this->matches[$id])) {
        // The first occurrence of an image wins.
        continue;
      }

      $path = parse_url($src, PHP_URL_PATH);
      $filename = is_string($path) ? basename($path) : '';

      // Be sure to set every value here that you will need when constructing
      // your entity later.
      $this->matches[$id] = array(
        'url' => $src,
        'alt' => isset($attributes['alt']) ? $attributes['alt'] : '',
        'title' => isset($attributes['title']) ? $attributes['title'] : '',
        'credit' => NULL,
        'id' => $id,
        'filename' => $filename,
        'custom_thing' => NULL,
      );
    }
  }

  /**
   * Extracts the quoted src, alt and title attributes from one <img> tag.
   *
   * @param string $tag
   *   The full markup of a single tag.
   *
   * @return array
   *   Entity-decoded attribute values keyed by lower-case attribute name.
   *   Unquoted attribute values are not recognized.
   */
  private function parseImageAttributes($tag) {
    $attributes = array();
    $pairs = array();
    // The leading \s keeps attributes such as data-src from matching as src.
    $regex = '/\s(src|alt|title)\s*=\s*(["\'])(.*?)\2/is';
    if (preg_match_all($regex, $tag, $pairs, PREG_SET_ORDER)) {
      foreach ($pairs as $pair) {
        $name = strtolower($pair[1]);
        // Keep the first value when an attribute is repeated.
        if (!isset($attributes[$name])) {
          $attributes[$name] = html_entity_decode($pair[3], ENT_QUOTES, 'UTF-8');
        }
      }
    }
    return $attributes;
  }

  /**
   * This is another example of an image finding method.
   *
   * @param array $post
   *   One entry of $this->contentArray ('title', 'content' and 'id' keys).
   */
  private function findImageMethodTwo($post = array()) {
    // Some DOM library parsing code could live here. Then the images that were
    // found would be added to $this->matches just like they are added in the
    // findImageMethodOne() method.
  }

 }
	<?php

	/**
	 * Source Migration class for finding image tags in markup.
	 *
	 * Queries a source table for HTML, scans it for <img> tags, and exposes one
	 * source row per unique image URL. The work is done lazily the first time a
	 * count or a row is requested and is cached for the lifetime of the object.
	 */
	class ExampleMigrateSourceImage extends MigrateSource {

	  /**
	   * The string of HTML content.
	   */
	  private $content = '';

	  /**
	   * Per-record chunks of content, for parsers that cannot take one big string.
	   *
	   * Each entry is an array with the keys 'title', 'content' and 'id'.
	   */
	  private $contentArray = array();

	  /**
	   * The array of matching image tags found, keyed by the image identifier.
	   */
	  private $matches = array();

	  /**
	   * The zero-based position of the next match to hand out in getNextRow().
	   */
	  private $matchesCurrent = 0;

	  /**
	   * Indicates if the content has been queried and parsed.
	   */
	  private $contentImported = FALSE;

	  /**
	   * {@inheritdoc}
	   */
	  public function __construct($options = array()) {
	    parent::__construct($options);
	  }

	  /**
	   * {@inheritdoc}
	   */
	  public function __toString() {
	    return t('Create and download images referenced from a string of markup.');
	  }

	  /**
	   * Return the number of available source records.
	   *
	   * @return int
	   *   The number of unique images found in the source content.
	   */
	  public function computeCount() {
	    $this->importContent();
	    // $this->matches is initialized to an array and only ever appended to, so
	    // it can be counted directly.
	    return count($this->matches);
	  }

	  /**
	   * Returns a list of fields available to be mapped from the source.
	   *
	   * @return array
	   *   Field descriptions keyed by source field name.
	   */
	  public function fields() {
	    return array(
	      'alt' => t('Alt text'),
	      'title' => t('Title text'),
	      'url' => t('URL'),
	      'credit' => t('Credit'),
	      'filename' => t('Filename'),
	      'node_title' => t('Title'),
	    );
	  }

	  /**
	   * Do whatever needs to be done to start a fresh traversal of the source data.
	   *
	   * This is always called at the start of an import, so tasks such as opening
	   * file handles, running queries, and so on should be performed here.
	   */
	  public function performRewind() {
	    $this->matchesCurrent = 0;
	  }

	  /**
	   * Fetch the next row of data, returning it as an object.
	   *
	   * @return object|false
	   *   A stdClass carrying every value stored for the match, or FALSE when
	   *   there is no more data available.
	   */
	  public function getNextRow() {
	    $this->importContent();
	    if ($this->matchesCurrent >= count($this->matches)) {
	      return FALSE;
	    }

	    // $this->matches is keyed by image identifier, so the current position is
	    // resolved with array_slice(). The slice is stored in a variable first
	    // because reset() takes its argument by reference.
	    $slice = array_slice($this->matches, $this->matchesCurrent, 1);
	    $match = reset($slice);

	    // Add all of the values collected in findImages().
	    $row = new stdClass();
	    foreach ($match as $key => $value) {
	      $row->{$key} = $value;
	    }

	    // Increment the current match counter.
	    $this->matchesCurrent++;
	    return $row;
	  }

	  /**
	   * Find and parse the source data if it hasn't already been done.
	   */
	  private function importContent() {
	    if (!$this->contentImported) {
	      // Build the content string to parse for images.
	      $this->buildContent();
	      // Find the images in the string and populate the matches array.
	      $this->findImages();
	      // Note that the import has been completed and does not need to be
	      // performed again.
	      $this->contentImported = TRUE;
	    }
	  }

	  /**
	   * Get all of the HTML that needs to be filtered for image tags and tokens.
	   *
	   * Fills both $this->content (one long string) and $this->contentArray (one
	   * entry per queried record).
	   */
	  private function buildContent() {
	    $content = $this->contentQuery()->execute()->fetchAll();
	    foreach ($content as $item) {
	      // This builds one long string, for parsing that can be done on long
	      // strings without using too much memory. Here, we add fields 'foo' and
	      // 'bar' from the query.
	      $this->content .= $item->foo;
	      $this->content .= $item->bar;

	      // This builds an array of content for parsing operations that need to
	      // be performed on smaller chunks of the source data to avoid memory
	      // issues. It is only required if you run into parsing issues, otherwise
	      // it can be removed. The values stay NULL unless contentQuery() selects
	      // the matching columns.
	      $this->contentArray[] = array(
	        'title' => isset($item->post_title) ? $item->post_title : NULL,
	        'content' => isset($item->post_content) ? $item->post_content : NULL,
	        'id' => isset($item->id) ? $item->id : NULL,
	      );
	    }
	  }

	  /**
	   * Creates the query that will be used to build the content string.
	   *
	   * @return object
	   *   A SelectQuery object.
	   */
	  protected function contentQuery() {
	    $query = Database::getConnection('default', 'example')->select('table', 't');
	    $query->fields('t', array(
	      'foo',
	      'bar',
	    ));
	    $query->condition('t.foo', '', '!=');
	    $query->condition('t.bar', '', '!=');
	    return $query;
	  }

	  /**
	   * Finds the desired elements in the markup.
	   *
	   * @return bool
	   *   FALSE when there was no content to search, TRUE otherwise.
	   */
	  private function findImages() {
	    // Verify that content was found.
	    if (empty($this->content)) {
	      $message = 'No HTML content with image tags to download could be found.';
	      watchdog('example_migrate', $message);
	      return FALSE;
	    }

	    // Find images where the entire source content string can be parsed at once.
	    $this->findImageMethodOne();

	    // Find images where the source content must be parsed in chunks.
	    foreach ($this->contentArray as $post) {
	      $this->findImageMethodTwo($post);
	    }
	    return TRUE;
	  }

	  /**
	   * This is an example of an image finding method.
	   *
	   * Scans the whole content string for <img> tags and records one match per
	   * unique src value. 'credit' and 'custom_thing' cannot be derived from an
	   * <img> tag and are left NULL for project-specific logic to fill in.
	   */
	  private function findImageMethodOne() {
	    // Tags whose attribute values contain a literal ">" are cut short by this
	    // pattern; switch to a DOM parser if the source markup needs that.
	    $tags = array();
	    $regex = '/<img\b[^>]*>/i';
	    if (!preg_match_all($regex, $this->content, $tags, PREG_SET_ORDER)) {
	      return;
	    }

	    foreach ($tags as $tag) {
	      $attributes = $this->parseImageAttributes($tag[0]);
	      $src = isset($attributes['src']) ? trim($attributes['src']) : '';
	      if ($src === '') {
	        continue;
	      }

	      // Set a unique row identifier - here the image URL. You might need to
	      // perform cleanup on this value to standardize it, as the path
	      // to /foo/bar/image.jpg, example.com/foo/bar/image.jpg, and
	      // http://example.com/foo/bar/image.jpg should not create three unique
	      // source records. Standardizing the URL is key for not just avoiding
	      // creating duplicate source records, but the URL is also the ID value
	      // you will use in your destination class mapping callback that looks up
	      // the resulting image entity ID from the data it finds in the body field.
	      $id = $src;
	      if (isset($this->matches[$id])) {
	        // The first occurrence of an image wins.
	        continue;
	      }

	      $path = parse_url($src, PHP_URL_PATH);
	      $filename = is_string($path) ? basename($path) : '';

	      // Be sure to set every value here that you will need when constructing
	      // your entity later.
	      $this->matches[$id] = array(
	        'url' => $src,
	        'alt' => isset($attributes['alt']) ? $attributes['alt'] : '',
	        'title' => isset($attributes['title']) ? $attributes['title'] : '',
	        'credit' => NULL,
	        'id' => $id,
	        'filename' => $filename,
	        'custom_thing' => NULL,
	      );
	    }
	  }

	  /**
	   * Extracts the quoted src, alt and title attributes from one <img> tag.
	   *
	   * @param string $tag
	   *   The full markup of a single tag.
	   *
	   * @return array
	   *   Entity-decoded attribute values keyed by lower-case attribute name.
	   *   Unquoted attribute values are not recognized.
	   */
	  private function parseImageAttributes($tag) {
	    $attributes = array();
	    $pairs = array();
	    // The leading \s keeps attributes such as data-src from matching as src.
	    $regex = '/\s(src|alt|title)\s*=\s*(["\'])(.*?)\2/is';
	    if (preg_match_all($regex, $tag, $pairs, PREG_SET_ORDER)) {
	      foreach ($pairs as $pair) {
	        $name = strtolower($pair[1]);
	        // Keep the first value when an attribute is repeated.
	        if (!isset($attributes[$name])) {
	          $attributes[$name] = html_entity_decode($pair[3], ENT_QUOTES, 'UTF-8');
	        }
	      }
	    }
	    return $attributes;
	  }

	  /**
	   * This is another example of an image finding method.
	   *
	   * @param array $post
	   *   One entry of $this->contentArray ('title', 'content' and 'id' keys).
	   */
	  private function findImageMethodTwo($post = array()) {
	    // Some DOM library parsing code could live here. Then the images that were
	    // found would be added to $this->matches just like they are added in the
	    // findImageMethodOne() method.
	  }

	}