  [Hadoop] Running a Hadoop Cluster with Docker (docker-compose)
    Hadoop 2021. 12. 15. 10:27


    Introduction.

    I wanted to get some hands-on practice in order to understand Hadoop.

    However, running it on Linux or on AWS meant either the hassle of working through a VM or being billed for cloud resources.

    So this post describes how to run a Hadoop cluster with Docker instead.

    Single NameNode & DataNode.

    The commands below create and run a docker-compose.yaml file that starts a single NameNode and a single DataNode.

     

    < hadoop config file >

    cat <<EOF > /tmp/hadoop_config
    CORE-SITE.XML_fs.default.name=hdfs://namenode
    CORE-SITE.XML_fs.defaultFS=hdfs://namenode
    HDFS-SITE.XML_dfs.namenode.rpc-address=namenode:8020
    HDFS-SITE.XML_dfs.replication=1
    HDFS-SITE.XML_dfs.permissions.enabled=false
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-applications=10000
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-am-resource-percent=0.1
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.resource-calculator=org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.queues=default
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.capacity=100
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.user-limit-factor=1
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.maximum-capacity=100
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.state=RUNNING
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_submit_applications=*
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_administer_queue=*
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.node-locality-delay=40
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings=
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings-override.enable=false
    EOF
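
    For reference, the entrypoint in the apache/hadoop image expands each FILE.XML_key=value environment variable into a <property> entry in the corresponding configuration file (core-site.xml, hdfs-site.xml, and so on). Once the cluster is up you can inspect the generated files; the /opt/hadoop path below is the image default and may differ in other images:

    docker exec namenode cat /opt/hadoop/etc/hadoop/hdfs-site.xml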

     

    < docker compose yaml file >

    cat <<EOF > /tmp/hadoop-docker-compose.yaml
    version: "2"
    services:
       namenode:
          platform: linux/amd64
          container_name: namenode
          image: apache/hadoop:3.3.6
          hostname: namenode
          command: ["hdfs", "namenode"]
          ports:
            - 9870:9870
            - 8020:8020
          env_file:
            - /tmp/hadoop_config
          environment:
              ENSURE_NAMENODE_DIR: "/tmp/hadoop-root/dfs/name"
          networks:
            - hadoop_network
       datanode:
          platform: linux/amd64   
          container_name: datanode
          depends_on:
            - namenode
          image: apache/hadoop:3.3.6
          command: ["hdfs", "datanode"]
          env_file:
            - /tmp/hadoop_config 
          networks:
            - hadoop_network
    networks:
      hadoop_network:
        name: hadoop_network        
    EOF
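
    For reference, the ENSURE_NAMENODE_DIR variable is used by the image's startup script to format the NameNode metadata directory on first start if it does not exist yet, so the cluster comes up without a manual hdfs namenode -format step.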

     

    < docker compose run >

    docker-compose -f /tmp/hadoop-docker-compose.yaml --project-name=hadoop up -d
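
    If both containers come up cleanly, a quick sanity check might look like the following (run inside the namenode container; the file path is just an example):

    docker exec namenode hdfs dfsadmin -report
    docker exec namenode bash -c 'echo hello | hdfs dfs -put - /sanity-check.txt'
    docker exec namenode hdfs dfs -cat /sanity-check.txt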

     

     

    Running with Docker.

    1. Create the hadoop config file.

    Hadoop is configured through XML files such as core-site.xml and hdfs-site.xml.

    Create the hadoop_config file with the command below. Note that the heredoc delimiter is quoted ('EOF') so that $HADOOP_HOME is written into the file literally instead of being expanded by the host shell.

    cat <<'EOF' > /tmp/hadoop_config
    CORE-SITE.XML_fs.default.name=hdfs://namenode
    CORE-SITE.XML_fs.defaultFS=hdfs://namenode
    HDFS-SITE.XML_dfs.namenode.rpc-address=namenode:8020
    HDFS-SITE.XML_dfs.replication=3
    HDFS-SITE.XML_dfs.permissions.enabled=false
    MAPRED-SITE.XML_mapreduce.framework.name=yarn
    MAPRED-SITE.XML_yarn.app.mapreduce.am.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
    MAPRED-SITE.XML_mapreduce.map.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
    MAPRED-SITE.XML_mapreduce.reduce.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
    YARN-SITE.XML_yarn.resourcemanager.hostname=resourcemanager
    YARN-SITE.XML_yarn.nodemanager.pmem-check-enabled=false
    YARN-SITE.XML_yarn.nodemanager.delete.debug-delay-sec=600
    YARN-SITE.XML_yarn.nodemanager.vmem-check-enabled=false
    YARN-SITE.XML_yarn.nodemanager.aux-services=mapreduce_shuffle
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-applications=10000
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-am-resource-percent=0.1
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.resource-calculator=org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.queues=default
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.capacity=100
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.user-limit-factor=1
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.maximum-capacity=100
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.state=RUNNING
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_submit_applications=*
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_administer_queue=*
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.node-locality-delay=40
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings=
    CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings-override.enable=false
    EOF
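
    Compared with the single-node config, this version raises dfs.replication to 3 (to match the three DataNodes defined below) and adds the MAPRED-SITE.XML and YARN-SITE.XML settings needed to run MapReduce jobs on YARN.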

     

    2. Create the hadoop docker-compose.yaml.

    Create docker-compose.yaml with the command below.

    The yaml file is placed under the /tmp directory.

    Unlike the single-node example above, this file defines one NameNode, three DataNodes, one ResourceManager, and one NodeManager.

     

    cat <<EOF > /tmp/hadoop-docker-compose.yaml
    version: "2"
    services:
       namenode:
          platform: linux/amd64
          container_name: namenode
          image: apache/hadoop:3
          hostname: namenode
          command: ["hdfs", "namenode"]
          ports:
            - 9870:9870
            - 8020:8020
          env_file:
            - /tmp/hadoop_config
          environment:
              ENSURE_NAMENODE_DIR: "/tmp/hadoop-root/dfs/name"
          networks:
            - hadoop_network
       datanode1:
          platform: linux/amd64   
          container_name: datanode1
          depends_on:
            - namenode
          image: apache/hadoop:3
          command: ["hdfs", "datanode"]
          env_file:
            - /tmp/hadoop_config 
          networks:
            - hadoop_network
       datanode2:
          platform: linux/amd64   
          container_name: datanode2   
          depends_on:
            - namenode
          image: apache/hadoop:3
          command: ["hdfs", "datanode"]
          env_file:
            - /tmp/hadoop_config      
          networks:
            - hadoop_network
       datanode3:
          platform: linux/amd64   
          container_name: datanode3 
          depends_on:
            - namenode
          image: apache/hadoop:3
          command: ["hdfs", "datanode"]
          env_file:
            - /tmp/hadoop_config   
          networks:
            - hadoop_network
       resourcemanager:
          platform: linux/amd64   
          container_name: resourcemanager      
          image: apache/hadoop:3
          hostname: resourcemanager
          command: ["yarn", "resourcemanager"]
          ports:
             - 8088:8088
          env_file:
            - /tmp/hadoop_config
          networks:
            - hadoop_network
       nodemanager:
          platform: linux/amd64   
          container_name: nodemanager
          image: apache/hadoop:3
          command: ["yarn", "nodemanager"]
          env_file:
            - /tmp/hadoop_config
          networks:
            - hadoop_network
    
    networks:
      hadoop_network:
        name: hadoop_network        
    EOF
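
    Only the NameNode ports (9870 for the web UI, 8020 for HDFS RPC) and the ResourceManager UI port (8088) are published to the host; the DataNodes and the NodeManager only need to reach the other services over the internal hadoop_network.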

     

    3. Run docker-compose.

    docker-compose -f /tmp/hadoop-docker-compose.yaml --project-name=hadoop up -d

     

    Once docker-compose is up, the containers are listed as follows.

    docker ps 
    CONTAINER ID   IMAGE             COMMAND                   CREATED          STATUS          PORTS                                            NAMES
    dc7c36eb63c6   apache/hadoop:3   "/usr/local/bin/dumb…"   2 minutes ago    Up 2 minutes                                                     datanode2
    cf5c1785120f   apache/hadoop:3   "/usr/local/bin/dumb…"   2 minutes ago    Up 2 minutes                                                     datanode1
    768d845b5935   apache/hadoop:3   "/usr/local/bin/dumb…"   2 minutes ago    Up 2 minutes                                                     datanode3
    e365b16d1b47   apache/hadoop:3   "/usr/local/bin/dumb…"   2 minutes ago    Up 2 minutes    0.0.0.0:8020->8020/tcp, 0.0.0.0:9870->9870/tcp   namenode
    03d652828a11   apache/hadoop:3   "/usr/local/bin/dumb…"   13 minutes ago   Up 12 minutes                                                    tmp-nodemanager-1
    e9aaacf23a9b   apache/hadoop:3   "/usr/local/bin/dumb…"   13 minutes ago   Up 12 minutes   0.0.0.0:8088->8088/tcp                           tmp-resourcemanager-1
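
    When you are done, the whole stack can be stopped and removed with the matching down command:

    docker-compose -f /tmp/hadoop-docker-compose.yaml --project-name=hadoop down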

     

     

    You can then open the Hadoop NameNode dashboard at http://localhost:9870/dfshealth.html#tab-overview.
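
    The YARN ResourceManager UI is likewise available at http://localhost:8088. As an optional smoke test, you can submit the bundled pi example job; the jar path below assumes the image's default /opt/hadoop layout, and the glob avoids hard-coding the exact Hadoop version:

    docker exec resourcemanager bash -c 'yarn jar /opt/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar pi 2 10'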

     

     

     

     

    A big-data-europe version of the docker-compose setup.

    https://github.com/big-data-europe/docker-hadoop/blob/master/resourcemanager/Dockerfile

    cat <<EOF > /tmp/hadoop.env
    CORE_CONF_fs_defaultFS=hdfs://namenode:9000
    CORE_CONF_hadoop_http_staticuser_user=root
    CORE_CONF_hadoop_proxyuser_hue_hosts=*
    CORE_CONF_hadoop_proxyuser_hue_groups=*
    CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec
    
    HDFS_CONF_dfs_webhdfs_enabled=true
    HDFS_CONF_dfs_permissions_enabled=false
    HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
    HDFS_CONF_dfs_replication=3
    
    YARN_CONF_yarn_log___aggregation___enable=true
    YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
    YARN_CONF_yarn_resourcemanager_recovery_enabled=true
    YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
    YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
    YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
    YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
    YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
    YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
    YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
    YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
    YARN_CONF_yarn_timeline___service_enabled=true
    YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
    YARN_CONF_yarn_timeline___service_hostname=historyserver
    YARN_CONF_mapreduce_map_output_compress=true
    YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
    YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
    YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
    YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
    YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
    YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
    
    MAPRED_CONF_mapreduce_framework_name=yarn
    MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
    MAPRED_CONF_mapreduce_map_memory_mb=4096
    MAPRED_CONF_mapreduce_reduce_memory_mb=8192
    MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
    MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
    MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
    MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
    MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
    EOF
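
    For reference, the entrypoint in the bde2020 images turns each CORE_CONF_/HDFS_CONF_/YARN_CONF_/MAPRED_CONF_ variable into a property in the matching *-site.xml file, mapping single underscores to dots and triple underscores to dashes; for example, HDFS_CONF_dfs_replication=3 becomes dfs.replication=3 in hdfs-site.xml.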
    
    
    cat <<EOF > /tmp/hadoop-docker-compose.yaml
    version: "3"
    
    services:
      namenode:
        image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
        container_name: namenode
        restart: always
        ports:
          - 9870:9870
          - 9000:9000
        volumes:
          - hadoop_namenode:/hadoop/dfs/name
        environment:
          - CLUSTER_NAME=test
        env_file:
          - /tmp/hadoop.env
    
      datanode1:
        image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
        container_name: datanode1   
        restart: always
        volumes:
          - hadoop_datanode1:/hadoop/dfs/data
        environment:
          SERVICE_PRECONDITION: "namenode:9870"
        env_file:
          - /tmp/hadoop.env
    
      datanode2:
        image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
        container_name: datanode2   
        restart: always
        volumes:
          - hadoop_datanode2:/hadoop/dfs/data
        environment:
          SERVICE_PRECONDITION: "namenode:9870"
        env_file:
          - /tmp/hadoop.env
    
      datanode3:
        image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
        container_name: datanode3
        restart: always
        volumes:
          - hadoop_datanode3:/hadoop/dfs/data
        environment:
          SERVICE_PRECONDITION: "namenode:9870"
        env_file:
          - /tmp/hadoop.env
      
      resourcemanager:
        image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
        container_name: resourcemanager
        ports:
          - 8088:8088    
        restart: always
        environment:
          SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864"
        env_file:
          - /tmp/hadoop.env
    
      nodemanager1:
        image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
        container_name: nodemanager
        restart: always
        environment:
          SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
        env_file:
          - /tmp/hadoop.env
      
      historyserver:
        image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
        container_name: historyserver
        restart: always
        environment:
          SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
        volumes:
          - hadoop_historyserver:/hadoop/yarn/timeline
        env_file:
          - /tmp/hadoop.env
      
    volumes:
      hadoop_namenode:
      hadoop_datanode1:
      hadoop_datanode2:
      hadoop_datanode3:
      hadoop_historyserver:
    
    EOF
    
    docker-compose -f /tmp/hadoop-docker-compose.yaml -p hadoop up -d
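
    Once the stack is up (the SERVICE_PRECONDITION checks make the YARN and history services wait for HDFS first), you can verify it the same way as before:

    docker-compose -f /tmp/hadoop-docker-compose.yaml -p hadoop ps

    The NameNode UI is again at http://localhost:9870 and the ResourceManager UI at http://localhost:8088.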

     

     
