// spark-instrumented-optimizer/core/src/main/scala/spark/Utils.scala

package spark

import java.io._
import java.net.InetAddress
import java.util.{Locale, UUID}
import java.util.concurrent.{Executors, ThreadFactory, ThreadPoolExecutor}

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

/**
 * Various utility methods used by Spark.
 */
object Utils {
  def serialize[T](o: T): Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bos)
    oos.writeObject(o)
    oos.close()
    bos.toByteArray
  }

  def deserialize[T](bytes: Array[Byte]): T = {
    val bis = new ByteArrayInputStream(bytes)
    val ois = new ObjectInputStream(bis)
    ois.readObject.asInstanceOf[T]
  }

  def deserialize[T](bytes: Array[Byte], loader: ClassLoader): T = {
    val bis = new ByteArrayInputStream(bytes)
    val ois = new ObjectInputStream(bis) {
      override def resolveClass(desc: ObjectStreamClass) =
        Class.forName(desc.getName, false, loader)
    }
    ois.readObject.asInstanceOf[T]
  }
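
  // Example (hypothetical round trip; any Serializable value works):
  //
  //   val bytes = Utils.serialize("hello")
  //   val s = Utils.deserialize[String](bytes)  // s == "hello"
  //
  //   // The loader variant resolves classes through a given ClassLoader,
  //   // e.g. classes that only exist in a dynamically added jar:
  //   val s2 = Utils.deserialize[String](bytes, getClass.getClassLoader)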

  def isAlpha(c: Char): Boolean = {
    (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
  }

  def splitWords(s: String): Seq[String] = {
    val buf = new ArrayBuffer[String]
    var i = 0
    while (i < s.length) {
      var j = i
      while (j < s.length && isAlpha(s.charAt(j))) {
        j += 1
      }
      if (j > i) {
        buf += s.substring(i, j)
      }
      i = j
      while (i < s.length && !isAlpha(s.charAt(i))) {
        i += 1
      }
    }
    buf
  }
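
  // Example: any non-alphabetic character delimits words and is dropped.
  //
  //   Utils.splitWords("Hello, world! 123 foo_bar")
  //   // => Seq("Hello", "world", "foo", "bar")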

  // Create a temporary directory inside the given parent directory
  def createTempDir(root: String = System.getProperty("java.io.tmpdir")): File = {
    var attempts = 0
    val maxAttempts = 10
    var dir: File = null
    while (dir == null) {
      attempts += 1
      if (attempts > maxAttempts) {
        throw new IOException("Failed to create a temp directory after " + maxAttempts +
          " attempts!")
      }
      try {
        dir = new File(root, "spark-" + UUID.randomUUID.toString)
        if (dir.exists() || !dir.mkdirs()) {
          dir = null
        }
      } catch {
        // A transient I/O error: fall through with dir == null and retry
        case e: IOException =>
      }
    }
    dir
  }
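
  // Example (a sketch): the directory lands under java.io.tmpdir by default,
  // named spark-<random UUID>; callers are responsible for cleanup.
  //
  //   val dir = Utils.createTempDir()
  //   try {
  //     // ... write scratch files under dir ...
  //   } finally {
  //     Utils.deleteRecursively(dir)  // defined below
  //   }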

  // Copy all data from an InputStream to an OutputStream
  def copyStream(in: InputStream,
                 out: OutputStream,
                 closeStreams: Boolean = false)
  {
    val buf = new Array[Byte](8192)
    var n = 0
    while (n != -1) {
      n = in.read(buf)
      if (n != -1) {
        out.write(buf, 0, n)
      }
    }
    if (closeStreams) {
      in.close()
      out.close()
    }
  }
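
  // Example (hypothetical file copy; data moves through the 8 KiB buffer
  // until read() signals end of stream):
  //
  //   val in = new FileInputStream("in.dat")
  //   val out = new FileOutputStream("out.dat")
  //   Utils.copyStream(in, out, closeStreams = true)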

  // Shuffle the elements of a collection into a random order, returning the
  // result in a new collection. Unlike scala.util.Random.shuffle, this method
  // uses a local random number generator, avoiding inter-thread contention.
  def randomize[T](seq: TraversableOnce[T]): Seq[T] = {
    val buf = new ArrayBuffer[T]()
    buf ++= seq
    val rand = new Random()
    for (i <- (buf.size - 1) to 1 by -1) {
      // Fisher-Yates: pick j uniformly from 0..i inclusive. nextInt(i) would
      // never leave element i in place, which biases the shuffle.
      val j = rand.nextInt(i + 1)
      val tmp = buf(j)
      buf(j) = buf(i)
      buf(i) = tmp
    }
    buf
  }
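
  // Example: the same elements come back in a (likely) different order.
  //
  //   Utils.randomize(1 to 5)  // e.g. Seq(3, 1, 5, 2, 4)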

  /**
   * Get the local host's IP address in dotted-quad format (e.g. 1.2.3.4).
   */
  def localIpAddress(): String = InetAddress.getLocalHost.getHostAddress

  /**
   * Returns a standard ThreadFactory except all threads are daemons.
   */
  private def newDaemonThreadFactory: ThreadFactory = {
    new ThreadFactory {
      def newThread(r: Runnable): Thread = {
        val t = Executors.defaultThreadFactory.newThread(r)
        t.setDaemon(true)
        t
      }
    }
  }

  /**
   * Wrapper over newCachedThreadPool.
   */
  def newDaemonCachedThreadPool(): ThreadPoolExecutor = {
    val threadPool = Executors.newCachedThreadPool.asInstanceOf[ThreadPoolExecutor]
    threadPool.setThreadFactory(newDaemonThreadFactory)
    threadPool
  }

  /**
   * Wrapper over newFixedThreadPool.
   */
  def newDaemonFixedThreadPool(nThreads: Int): ThreadPoolExecutor = {
    val threadPool = Executors.newFixedThreadPool(nThreads).asInstanceOf[ThreadPoolExecutor]
    threadPool.setThreadFactory(newDaemonThreadFactory)
    threadPool
  }
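
  // Example (a sketch): daemon threads let the JVM exit even if the pool is
  // never shut down explicitly.
  //
  //   val pool = Utils.newDaemonFixedThreadPool(4)
  //   pool.execute(new Runnable {
  //     def run() { println("hello from " + Thread.currentThread.getName) }
  //   })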

  /**
   * Get the local machine's hostname.
   */
  def localHostName(): String = InetAddress.getLocalHost.getHostName

  /**
   * Get the current host, preferring the spark.hostname system property if set.
   */
  def getHost: String = System.getProperty("spark.hostname", localHostName())
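
  // Example: the system property, when set, overrides the detected hostname.
  //
  //   System.setProperty("spark.hostname", "worker-1")
  //   Utils.getHost  // => "worker-1"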

  /**
   * Delete a file or directory and its contents recursively.
   */
  def deleteRecursively(file: File) {
    if (file.isDirectory) {
      for (child <- file.listFiles()) {
        deleteRecursively(child)
      }
    }
    if (!file.delete()) {
      throw new IOException("Failed to delete: " + file)
    }
  }

  /**
   * Use unit suffixes (Byte, Kilobyte, Megabyte, Gigabyte, Terabyte and
   * Petabyte) in order to reduce the number of digits to four or less. For
   * example, 4,000,000 is returned as 3.8MB.
   */
  def memoryBytesToString(size: Long): String = {
    val PB = 1L << 50
    val TB = 1L << 40
    val GB = 1L << 30
    val MB = 1L << 20
    val KB = 1L << 10

    val (value, unit) = {
      if (size >= 2*PB) {
        (size.toDouble / PB, "PB")
      } else if (size >= 2*TB) {
        (size.toDouble / TB, "TB")
      } else if (size >= 2*GB) {
        (size.toDouble / GB, "GB")
      } else if (size >= 2*MB) {
        (size.toDouble / MB, "MB")
      } else if (size >= 2*KB) {
        (size.toDouble / KB, "KB")
      } else {
        (size.toDouble, "B")
      }
    }
    "%.1f%s".formatLocal(Locale.US, value, unit)
  }
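
  // Example: the value is shown in the largest unit where it is at least 2,
  // with one decimal place.
  //
  //   Utils.memoryBytesToString(1L << 31)  // => "2.0GB"
  //   Utils.memoryBytesToString(4000000)   // => "3.8MB"
  //   Utils.memoryBytesToString(3000)      // => "2.9KB"
  //   Utils.memoryBytesToString(500)       // => "500.0B"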
}