[{"data":1,"prerenderedAt":1629},["ShallowReactive",2],{"doc-en-\u002Fen\u002Fdocs\u002Ftroubleshooting\u002Fcommon-problems":3,"docs-en-all":1562},{"id":4,"title":5,"body":6,"category":1545,"description":1546,"draft":1547,"extension":1548,"icon":1549,"lastReviewed":1550,"meta":1551,"navigation":227,"order":44,"path":1552,"prerequisites":1553,"readingTime":1554,"seo":1555,"stem":1556,"tags":1557,"__hash__":1561},"docs_en\u002Fen\u002Fdocs\u002Ftroubleshooting\u002Fcommon-problems.md","Troubleshooting common problems",{"type":7,"value":8,"toc":1529},"minimark",[9,13,18,25,30,88,91,96,99,132,135,175,179,192,196,199,314,318,339,371,374,378,387,391,404,407,411,414,436,439,460,473,477,490,494,512,515,519,522,530,568,572,577,581,611,618,622,625,689,692,696,701,705,746,749,753,764,768,773,777,801,804,808,811,836,839,843,851,855,884,891,895,898,968,971,988,992,1005,1009,1012,1047,1058,1062,1065,1102,1105,1109,1114,1118,1132,1135,1139,1142,1157,1160,1186,1190,1199,1203,1250,1253,1257,1260,1317,1320,1350,1354,1365,1369,1402,1405,1409,1412,1435,1438,1455,1458,1462,1465,1489,1499,1503,1525],[10,11,12],"p",{},"This guide covers the twelve problems that most often appear in HeroCtl clusters. Each item has symptom, diagnosis, and fix. Use as a quick reference during an incident.",[14,15,17],"h2",{"id":16},"_1-cluster-wont-start-cannot-bind-to-port-8080","1. Cluster won't start: \"cannot bind to port 8080\"",[10,19,20,24],{},[21,22,23],"strong",{},"Symptom:"," the service comes up and dies right after. The log says port 8080 is in use.",[10,26,27],{},[21,28,29],{},"Diagnosis:",[31,32,37],"pre",{"className":33,"code":34,"language":35,"meta":36,"style":36},"language-bash shiki shiki-themes github-dark-default","sudo lsof -i :8080\n# ou\nsudo ss -tlnp | grep 8080\n","bash","",[38,39,40,60,67],"code",{"__ignoreMap":36},[41,42,45,49,53,57],"span",{"class":43,"line":44},"line",1,[41,46,48],{"class":47},"sQhOw","sudo",[41,50,52],{"class":51},"s9uIt"," lsof",[41,54,56],{"class":55},"sFSAA"," -i",[41,58,59],{"class":51}," :8080\n",[41,61,63],{"class":43,"line":62},2,[41,64,66],{"class":65},"sH3jZ","# ou\n",[41,68,70,72,75,78,82,85],{"class":43,"line":69},3,[41,71,48],{"class":47},[41,73,74],{"class":51}," ss",[41,76,77],{"class":55}," -tlnp",[41,79,81],{"class":80},"suJrU"," |",[41,83,84],{"class":47}," grep",[41,86,87],{"class":55}," 8080\n",[10,89,90],{},"The output shows which process is holding the port.",[10,92,93],{},[21,94,95],{},"Fix:",[10,97,98],{},"If it's a legitimate process (another app), change the HeroCtl port:",[31,100,104],{"className":101,"code":102,"language":103,"meta":36,"style":36},"language-yaml shiki shiki-themes github-dark-default","# \u002Fetc\u002Fheroctl\u002Fserver.yaml\napi:\n  port: 8090\n","yaml",[38,105,106,111,121],{"__ignoreMap":36},[41,107,108],{"class":43,"line":44},[41,109,110],{"class":65},"# \u002Fetc\u002Fheroctl\u002Fserver.yaml\n",[41,112,113,117],{"class":43,"line":62},[41,114,116],{"class":115},"sPWt5","api",[41,118,120],{"class":119},"sZEs4",":\n",[41,122,123,126,129],{"class":43,"line":69},[41,124,125],{"class":115},"  port",[41,127,128],{"class":119},": ",[41,130,131],{"class":55},"8090\n",[10,133,134],{},"If it's a zombie process (old HeroCtl that didn't die cleanly):",[31,136,138],{"className":33,"code":137,"language":35,"meta":36,"style":36},"sudo kill -9 \u003CPID>\nsudo systemctl start heroctl-server\n",[38,139,140,162],{"__ignoreMap":36},[41,141,142,144,147,150,153,156,159],{"class":43,"line":44},[41,143,48],{"class":47},[41,145,146],{"class":51}," 
kill",[41,148,149],{"class":55}," -9",[41,151,152],{"class":80}," \u003C",[41,154,155],{"class":51},"PI",[41,157,158],{"class":119},"D",[41,160,161],{"class":80},">\n",[41,163,164,166,169,172],{"class":43,"line":62},[41,165,48],{"class":47},[41,167,168],{"class":51}," systemctl",[41,170,171],{"class":51}," start",[41,173,174],{"class":51}," heroctl-server\n",[14,176,178],{"id":177},"_2-node-cannot-join-the-cluster","2. Node cannot join the cluster",[10,180,181,183,184,187,188,191],{},[21,182,23],{}," the join command hangs or returns ",[38,185,186],{},"connection refused"," \u002F ",[38,189,190],{},"invalid token",".",[10,193,194],{},[21,195,29],{},[10,197,198],{},"Three common suspects:",[31,200,202],{"className":33,"code":201,"language":35,"meta":36,"style":36},"# 1. Token expirou?\nheroctl cluster join-token list\n\n# 2. Firewall bloqueando?\nnc -zv \u003Cip-do-coordenador> 4646\nnc -zv \u003Cip-do-coordenador> 4647\nnc -zv \u003Cip-do-coordenador> 4648\n\n# 3. Relógios fora de sincronia?\ntimedatectl status\n",[38,203,204,209,223,229,235,258,276,294,299,305],{"__ignoreMap":36},[41,205,206],{"class":43,"line":44},[41,207,208],{"class":65},"# 1. Token expirou?\n",[41,210,211,214,217,220],{"class":43,"line":62},[41,212,213],{"class":47},"heroctl",[41,215,216],{"class":51}," cluster",[41,218,219],{"class":51}," join-token",[41,221,222],{"class":51}," list\n",[41,224,225],{"class":43,"line":69},[41,226,228],{"emptyLinePlaceholder":227},true,"\n",[41,230,232],{"class":43,"line":231},4,[41,233,234],{"class":65},"# 2. Firewall bloqueando?\n",[41,236,238,241,244,246,249,252,255],{"class":43,"line":237},5,[41,239,240],{"class":47},"nc",[41,242,243],{"class":55}," -zv",[41,245,152],{"class":80},[41,247,248],{"class":51},"ip-do-coordenado",[41,250,251],{"class":119},"r",[41,253,254],{"class":80},">",[41,256,257],{"class":55}," 4646\n",[41,259,261,263,265,267,269,271,273],{"class":43,"line":260},6,[41,262,240],{"class":47},[41,264,243],{"class":55},[41,266,152],{"class":80},[41,268,248],{"class":51},[41,270,251],{"class":119},[41,272,254],{"class":80},[41,274,275],{"class":55}," 4647\n",[41,277,279,281,283,285,287,289,291],{"class":43,"line":278},7,[41,280,240],{"class":47},[41,282,243],{"class":55},[41,284,152],{"class":80},[41,286,248],{"class":51},[41,288,251],{"class":119},[41,290,254],{"class":80},[41,292,293],{"class":55}," 4648\n",[41,295,297],{"class":43,"line":296},8,[41,298,228],{"emptyLinePlaceholder":227},[41,300,302],{"class":43,"line":301},9,[41,303,304],{"class":65},"# 3. Relógios fora de sincronia?\n",[41,306,308,311],{"class":43,"line":307},10,[41,309,310],{"class":47},"timedatectl",[41,312,313],{"class":51}," status\n",[10,315,316],{},[21,317,95],{},[319,320,321,328,336],"ul",{},[322,323,324,325,191],"li",{},"Token expired: generate another with ",[38,326,327],{},"heroctl cluster join-token create --ttl 1h",[322,329,330,331,191],{},"Firewall: open ports 4646, 4647, and 4648 between nodes. 
See ",[332,333,335],"a",{"href":334},"\u002Fen\u002Fdocs\u002Fnetworking\u002Ffirewall","firewall",[322,337,338],{},"Clock: install and enable NTP.",[31,340,342],{"className":33,"code":341,"language":35,"meta":36,"style":36},"sudo apt install chrony\nsudo systemctl enable --now chrony\n",[38,343,344,357],{"__ignoreMap":36},[41,345,346,348,351,354],{"class":43,"line":44},[41,347,48],{"class":47},[41,349,350],{"class":51}," apt",[41,352,353],{"class":51}," install",[41,355,356],{"class":51}," chrony\n",[41,358,359,361,363,366,369],{"class":43,"line":62},[41,360,48],{"class":47},[41,362,168],{"class":51},[41,364,365],{"class":51}," enable",[41,367,368],{"class":55}," --now",[41,370,356],{"class":51},[10,372,373],{},"A drift greater than 30 seconds between nodes breaks coordination.",[14,375,377],{"id":376},"_3-cluster-lost-coordination","3. Cluster lost coordination",[10,379,380,382,383,386],{},[21,381,23],{}," API responds with ",[38,384,385],{},"503"," and messages about a missing coordinator. Changes are not accepted.",[10,388,389],{},[21,390,29],{},[31,392,394],{"className":33,"code":393,"language":35,"meta":36,"style":36},"heroctl cluster status\n",[38,395,396],{"__ignoreMap":36},[41,397,398,400,402],{"class":43,"line":44},[41,399,213],{"class":47},[41,401,216],{"class":51},[41,403,313],{"class":51},[10,405,406],{},"You'll see how many nodes are healthy. If fewer than half respond, the cluster locks into read-only mode for safety.",[10,408,409],{},[21,410,95],{},[10,412,413],{},"The normal solution is to bring the downed nodes back:",[31,415,417],{"className":33,"code":416,"language":35,"meta":36,"style":36},"ssh nó-caído sudo systemctl start heroctl-server\n",[38,418,419],{"__ignoreMap":36},[41,420,421,424,427,430,432,434],{"class":43,"line":44},[41,422,423],{"class":47},"ssh",[41,425,426],{"class":51}," nó-caído",[41,428,429],{"class":51}," sudo",[41,431,168],{"class":51},[41,433,171],{"class":51},[41,435,174],{"class":51},[10,437,438],{},"If they don't come back (dead disk, lost machine), use forced bootstrap from the latest snapshot:",[31,440,442],{"className":33,"code":441,"language":35,"meta":36,"style":36},"heroctl snapshot restore \u002Fbackups\u002Fultimo.tar.gz --force-bootstrap\n",[38,443,444],{"__ignoreMap":36},[41,445,446,448,451,454,457],{"class":43,"line":44},[41,447,213],{"class":47},[41,449,450],{"class":51}," snapshot",[41,452,453],{"class":51}," restore",[41,455,456],{"class":51}," \u002Fbackups\u002Fultimo.tar.gz",[41,458,459],{"class":55}," --force-bootstrap\n",[461,462,463],"blockquote",{},[10,464,465,468,469,191],{},[21,466,467],{},"Warning:"," forced bootstrap discards everything that happened after the snapshot. See ",[332,470,472],{"href":471},"\u002Fen\u002Fdocs\u002Fobservability\u002Fbackup-restore","backup and restore",[14,474,476],{"id":475},"_4-job-stuck-in-pending","4. Job stuck in \"pending\"",[10,478,479,481,482,485,486,489],{},[21,480,23],{}," ",[38,483,484],{},"heroctl jobs status meu-job"," shows ",[38,487,488],{},"pending"," for minutes. Nothing starts.",[10,491,492],{},[21,493,29],{},[31,495,497],{"className":33,"code":496,"language":35,"meta":36,"style":36},"heroctl jobs explain meu-job\n",[38,498,499],{"__ignoreMap":36},[41,500,501,503,506,509],{"class":43,"line":44},[41,502,213],{"class":47},[41,504,505],{"class":51}," jobs",[41,507,508],{"class":51}," explain",[41,510,511],{"class":51}," meu-job\n",[10,513,514],{},"The output details why the scheduler can't place the job. 
## 5. Health check failing

**Symptom:** the job comes up but is marked unhealthy and restarts in a loop.

**Diagnosis:**

```bash
heroctl logs <alloc-id> | tail -50
```

Frequently the app takes longer to start than `healthy_deadline` allows.

**Fix:**

Increase the deadline:

```yaml
health_check:
  path: /health
  port: 8080
  interval: 10s
  timeout: 3s
  healthy_deadline: 120s    # was 30s
```

If the app really is slow to start (loads a huge cache, connects to several services), work on boot time. Lazy loading usually solves it.
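Before raising the deadline, you can time the endpoint by hand. A sketch from the node running the allocation, assuming the app is reachable locally on the port from the `health_check` block:

```bash
# -f fails on HTTP errors; -m 3 mirrors the 3s check timeout.
time curl -fsS -m 3 http://localhost:8080/health
```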
## 6. TLS certificate not issued

**Symptom:** the site responds with a self-signed certificate or a TLS error. Ingress logs mention a failure in automatic issuance.

**Diagnosis:**

```bash
# Does DNS point to the correct IP?
dig +short mydomain.com

# Is port 80 reachable externally?
curl -I http://mydomain.com/.well-known/acme-challenge/test
```

Automatic certificate issuance needs two things: public DNS pointing at a cluster node, and port 80 open to the world.

**Fix:**

- Wrong DNS: fix the A record at your provider.
- Port 80 closed: open it on the server's firewall and the provider's firewall (security group, etc.).
- Domain behind an active CDN proxy: turn the proxy off temporarily for issuance; re-enable it afterwards.
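Once issuance succeeds, you can inspect what the ingress is actually serving. A sketch (`mydomain.com` is a placeholder):

```bash
# Show issuer and validity window of the certificate presented on 443.
openssl s_client -connect mydomain.com:443 -servername mydomain.com </dev/null 2>/dev/null \
  | openssl x509 -noout -issuer -dates
```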
## 7. App slow under load

**Symptom:** latency rises when traffic grows. Users complain.

**Diagnosis:**

```bash
heroctl metrics --job my-app --since 30m
```

Look at CPU, memory, and instance count. Also check whether a deploy is in flight: gradual deploys temporarily remove capacity.

**Fix:**

If capacity is short, scale out:

```bash
heroctl jobs scale my-app --count 6   # from 3 to 6
```

If a deploy is in flight, wait for it to finish before evaluating. If the app has a memory leak or a tight loop, profile the code; the orchestrator can't fix a problem inside the app.

## 8. Logs don't show up

**Symptom:** `heroctl logs` returns nothing even though the app is running and producing output.

**Diagnosis:**

```bash
docker inspect <container-id> | grep LogConfig
```

If you see `"Type": "none"` or an unsupported driver, that's the problem.

**Fix:**

Configure the default log driver on the machine:

```json
// /etc/docker/daemon.json
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "3"
  }
}
```

Restart the service:

```bash
sudo systemctl restart docker
```
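Note that the default driver only applies to containers created after the restart; existing containers keep the driver they were started with, so redeploy the job. To confirm the daemon picked up the change:

```bash
# Should print json-file after the restart.
docker info --format '{{.LoggingDriver}}'
```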
## 9. Postgres connection timing out

**Symptom:** the app logs `connection timeout` or `too many clients` when connecting to the database.

**Diagnosis:**

On Postgres:

```sql
SELECT count(*) FROM pg_stat_activity;
SHOW max_connections;
```

If `count(*)` is close to `max_connections`, the pool is saturated.

**Fix:**

Put a pgbouncer between the apps and the database:

```yaml
# pgbouncer job
config:
  max_client_conn: 1000
  default_pool_size: 25
```

And point the apps at pgbouncer instead of the database directly. You can serve thousands of client connections with a handful of real database connections.
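A quick way to confirm the new path works end to end; the hostname, port (6432 is pgbouncer's usual default), and credentials here are placeholders:

```bash
# A round trip through pgbouncer proves pooling is in the path.
psql "host=pgbouncer.internal port=6432 dbname=app user=app" -c "SELECT 1;"
```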
## 10. Cluster appears to have two coordinators

**Symptom:** strange behavior: writes on one node don't show up on another, and metrics are inconsistent across panels.

**Diagnosis:**

```bash
heroctl cluster peers
```

If the peer list varies depending on which node you query, there was a network split and both halves thought they were the good half.

**Fix:**

Identify the minority half (the one with fewer nodes) and restart those nodes:

```bash
sudo systemctl restart heroctl-server
```

They re-sync with the majority half and the inconsistency goes away. Then check whether any data diverged during the interval:

```bash
heroctl jobs status --all | grep -i diverge
```
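To confirm the halves converged, compare the peer list as seen from every node; it should now be identical everywhere. A sketch with placeholder hostnames:

```bash
# Every node should print the same peer set.
for n in node1 node2 node3; do
  echo "== $n =="
  ssh "$n" heroctl cluster peers
done
```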
## 11. Disk full

**Symptom:** the node starts misbehaving. The API is slow, the agent restarts containers for no apparent reason, and `df -h` shows 100%.

**Diagnosis:**

```bash
sudo du -sh /var/lib/heroctl/* | sort -h
sudo du -sh /var/log/* | sort -h
```

The usual culprits are old logs and uncleaned snapshots.

**Fix:**

Configure rotation:

```yaml
# /etc/heroctl/server.yaml
logs:
  retention_days: 7
  max_size_per_alloc_mb: 500

snapshots:
  retention_count: 10
```

And do an immediate manual cleanup:

```bash
sudo journalctl --vacuum-time=3d
heroctl snapshot prune --keep 10
```

## 12. Container killed for lack of memory

**Symptom:** `heroctl logs` ends with `OOMKilled`. The container restarts in a loop.

**Diagnosis:**

```bash
heroctl alloc status <id> | grep -A5 "memory"
```

Compare actual usage with the defined limit.

**Fix:**

Raise the limit in the job spec:

```yaml
resources:
  memory_mb: 1024    # was 512
```

Submit the new version:

```bash
heroctl jobs submit my-app.json
```

If memory usage grows over time (a leak), raising the limit only delays the problem. Investigate the app.
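To tell a leak from a legitimately higher footprint, watch the trend rather than a single reading. A sketch using the metrics command from problem 7, assuming its output includes a memory line:

```bash
# Three readings, ten minutes apart; a steady climb under flat traffic suggests a leak.
for i in 1 2 3; do
  date
  heroctl metrics --job my-app --since 10m | grep -i memory
  sleep 600
done
```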
## When none of this helps

Gather the following information before opening a ticket:

- `heroctl cluster status` (full output)
- `heroctl version` from every node
- The `request_id` returned by the API error
- A log excerpt covering the timestamp of the incident

Send it to [suporte@heroctl.com](mailto:suporte@heroctl.com) with this information in the message body. The more context, the faster the response.
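A sketch that bundles the pieces above into a single file you can attach; run it on a cluster node (the journal unit name matches the systemd service used throughout this guide):

```bash
# Collect cluster state, version, and the last hour of server logs.
{
  heroctl cluster status
  heroctl version
  sudo journalctl -u heroctl-server --since "1 hour ago"
} > support-bundle.txt 2>&1
```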
## Next steps

- [Metrics and alerts](/en/docs/observability/metrics-logs): detect problems before users do.
- [Backup and restore](/en/docs/observability/backup-restore): preparation for the worst scenarios.
- [API reference](/en/docs/api/api-reference): when the CLI isn't enough.